table.desc <- function(var1) {
res1 <- c(mean = mean(var1),
          sd = sd(var1),
          quantile(var1, c(.01, .1, .5, .9, .99)),
          perc.zero = mean(1*(var1 == 0)),   # share of exact zeros
          N = sum(1*!is.na(var1)))           # number of non-missing observations
return(res1)
}
table.1 <- sapply(dep.vars, function(x) table.desc(df[,x]))
colnames(table.1) <- c("Total" , "Inpat." , "ER " , "# of presc.")
pander(table.1)
| | Total | Inpat. | ER | # of presc. |
|---|---|---|---|---|
| mean | 2390 | 645 | 111.7 | 6.248 |
| sd | 9205 | 6531 | 604.7 | 14.38 |
| 1% | 0 | 0 | 0 | 0 |
| 10% | 0 | 0 | 0 | 0 |
| 50% | 435 | 0 | 0 | 1 |
| 90% | 5507 | 0 | 110 | 18 |
| 99% | 29101 | 15208 | 2521 | 69 |
| perc.zero | 0.2086 | 0.9487 | 0.8841 | 0.4604 |
| N | 25479 | 25479 | 25479 | 25479 |
Assuming the data are in 2007 dollars, the average total expenditure per patient (presumably per year) is $2390. The distributions of all types of expenditure are severely skewed to the right, most likely because a large fraction of the population has zero expenditure.
In what follows, we start by analyzing only observations with strictly positive values of the dependent variables. For this purpose, we present descriptive statistics for this sub-sample.
table.2 <- sapply(dep.vars, function(x) table.desc(df[,x][df[,x]>0]))
colnames(table.2) <- c("Total" , "Inpat." , "ER " , "# of presc.")
pander(table.2[-which(rownames(table.2) %in% "perc.zero"),])
| | Total | Inpat. | ER | # of presc. |
|---|---|---|---|---|
| mean | 3020 | 12565 | 964 | 11.58 |
| sd | 10255 | 26108 | 1528 | 17.93 |
| 1% | 17 | 105.4 | 18 | 1 |
| 10% | 101 | 1536 | 85.1 | 1 |
| 50% | 771.5 | 6070 | 463 | 5 |
| 90% | 6738 | 26181 | 2274 | 29.2 |
| 99% | 32949 | 103259 | 7605 | 86.52 |
| N | 20164 | 1308 | 2952 | 13749 |
Here we estimate the optimal Box-Cox transformation of each dependent variable, which takes the form:
\[ \begin{align} \frac{y^{\lambda} - 1}{\lambda} &= X\beta + \varepsilon \label{boxcox.1} \end{align} \]
boxcox.w.cov <- function(var1) {
eqn1 <- formula(paste(paste(var1," ~", sep=""), paste(exp.vars, collapse = "+") ))
res1 <- boxcox(eqn1, data = df[ df[,var1] > 0, ], lambda = seq(-.1, .2, length = 100), ylab=var1)
lambda.hat <- res1$x[which(res1$y == max(res1$y))]
# loglik.hat <- max(res1$y)
# tab.boxcox.w<- c(lambda.hat, loglik.hat)
return(lambda.hat)
}
par(mfrow=c(2,2))
table.3 <- round(sapply(dep.vars, function(x) boxcox.w.cov(x)), digits = 4)
mtext("Optimal Box-Cox transformation for each dependent variable", side=3, outer=TRUE, line=-3)
Table 3: Box-Cox estimates for transformation parameter \(\lambda\)
| Dep. variable | \(\hat{\lambda}\) |
|---|---|
| totexp | 0.0152 |
| ipexp | 0.1273 |
| erexp | 0.0545 |
| rxtot | -0.0818 |
As we can see, all estimates are very close to zero, which indicates that the log transformation is the most appropriate one.
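This is consistent with the limiting behavior of the Box-Cox family:
\[ \lim_{\lambda \to 0} \frac{y^{\lambda} - 1}{\lambda} = \log(y), \]
so an estimate of \(\lambda\) near zero points to the log transformation, while \(\hat{\lambda} = 1\) would correspond to a (shifted) linear specification.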
If we repeat the exercise but now without independent variables we get:
boxcox.wo.cov <- function(var1) {
eqn1 <- formula (paste(var1, 1, sep="~") )
res1 <- boxcox(eqn1, data = df[ df[,var1] > 0, ], lambda = seq(-.1, .2, length = 100), ylab=var1)
lambda.hat <- res1$x[which(res1$y == max(res1$y))]
# loglik.hat <- max(res1$y)
# tab.boxcox.w<- c(lambda.hat, loglik.hat)
return(lambda.hat)
}
par(mfrow=c(2,2))
table.4 <- round(sapply(dep.vars, function(x) boxcox.wo.cov(x)), digits =4)
mtext("Optimal Box-Cox transformation for each dependent variable", side=3, outer=TRUE, line=-3)
Table 4: Box-Cox estimates for transformation parameter \(\lambda\), without covariates
| Dep. variable | \(\hat{\lambda}\) |
|---|---|
| totexp | 0 |
| ipexp | 0.1212 |
| erexp | 0.0455 |
| rxtot | -0.1 |
The results are qualitatively very similar. All of this suggests that, when estimating by OLS, we should use the log transformation.
Here we test which GLM family provides the best fit.
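The procedure in the code below is a modified Park-type test: after fitting a log-link Poisson GLM, the squared raw-scale residuals are regressed (again with a log link) on the log of the fitted values,
\[ E\left[ (y_{i} - \hat{y}_{i})^{2} \mid x_{i} \right] = \alpha_{0}\, \hat{y}_{i}^{\gamma}. \]
The estimate \(\hat{\gamma}\) is reported in the "coef" row of Table 5, and the values \(\gamma = 0, 1, 2, 3\) correspond to the Gaussian, Poisson, gamma, and inverse Gaussian families; these are the four null hypotheses tested below.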
glm.family <- function(var1) {
eqn1 <- formula(paste(paste(var1," ~", sep=""), paste(exp.vars, collapse = "+") ))
glm.fit.1 <- glm(formula = eqn1, family = poisson, data = df[ df[,var1] > 0, ])
y.hat <- predict(glm.fit.1, type= "response")
lny.hat <- predict(glm.fit.1, type= "link")
raw2 <- ( df[df[,var1] > 0 , var1] - y.hat )^2
glm.fit.2 <- glm(raw2~lny.hat, family=poisson , data = df[ df[,var1] > 0, ])
test.0 <- linearHypothesis(glm.fit.2, c("lny.hat = 0"), test = "Chisq", vcov = vcovHC(glm.fit.2, type = "HC"))
test.1 <- linearHypothesis(glm.fit.2, c("lny.hat = 1"), test = "Chisq", vcov = vcovHC(glm.fit.2, type = "HC"))
test.2 <- linearHypothesis(glm.fit.2, c("lny.hat = 2"), test = "Chisq", vcov = vcovHC(glm.fit.2, type = "HC"))
test.3 <- linearHypothesis(glm.fit.2, c("lny.hat = 3"), test = "Chisq", vcov = vcovHC(glm.fit.2, type = "HC"))
table.tests <- as.data.frame(rbind(test.0[2,3:4], test.1[2,3:4], test.2[2,3:4], test.3[2,3:4]))
table.tests <- format(table.tests , digits = 3)
table.tests <- c(format(glm.fit.2$coef[2],digits =3) , paste(table.tests[,1], "(", table.tests[,2], ")"))
return(table.tests)
}
table.5 <- sapply(dep.vars, function(x) glm.family(x))
rownames(table.5) <- c("coef" , paste("H0: ", 0:3) )
The following table presents the chi-squared statistic (with the p-value in parentheses) for each GLM family test, for all dependent variables. Notice that we are still working only with the positive values of each dependent variable. Additionally, we perform all calculations with robust standard errors in order to obtain results comparable to those in the class slides.
Table 5: GLM family test for all dependent variables
| | totexp | ipexp |
|---|---|---|
| coef | 1.51 | 2.45 |
| H0: 0 | 127.3 ( 1.60e-29 ) | 89.55 ( 2.99e-21 ) |
| H0: 1 | 14.5 ( 1.38e-04 ) | 31.43 ( 2.06e-08 ) |
| H0: 2 | 13.4 ( 2.53e-04 ) | 3.06 ( 8.01e-02 ) |
| H0: 3 | 123.9 ( 8.90e-29 ) | 4.44 ( 3.52e-02 ) |
| | erexp | rxtot |
|---|---|---|
| coef | 1.95 | 1.55 |
| H0: 0 | 101.7940 ( 6.16e-24 ) | 1267 ( 1.38e-277 ) |
| H0: 1 | 24.1635 ( 8.85e-07 ) | 160 ( 1.32e-36 ) |
| H0: 2 | 0.0666 ( 7.96e-01 ) | 107 ( 5.36e-25 ) |
| H0: 3 | 29.5033 ( 5.58e-08 ) | 1108 ( 5.44e-243 ) |
Although we reject the null hypothesis in every case, the hypothesis closest to not being rejected is \(H_{0}: \gamma = 2\) for all the dependent variables. Hence we should use the gamma family in our GLM estimations.
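For concreteness, the specification carried forward for the positive observations is a gamma GLM with a log link. A minimal sketch, assuming `df`, `dep.vars`, and `exp.vars` as defined above; `gamma.fit` and `gamma.fits` are illustrative names, and the same call appears inside the marginal-effects functions below:

```r
# Gamma GLM with log link, fit on the positive observations of each outcome
gamma.fit <- function(var1) {
  eqn1 <- formula(paste(var1, "~", paste(exp.vars, collapse = "+")))
  glm(eqn1, family = Gamma(link = log), data = df[df[, var1] > 0, ])
}
gamma.fits <- lapply(dep.vars, gamma.fit)
```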
We now estimate a linear regression model for each dependent variable in logs and compare the two standard retransformations of the fitted values.
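These are the normal-theory retransformation and Duan's smearing estimator, computed in the code below as
\[ \hat{y}^{norm}_{i} = \exp\left(x_{i}\hat{\beta} + \tfrac{1}{2}\hat{\sigma}^{2}\right), \qquad \hat{y}^{smear}_{i} = \exp\left(x_{i}\hat{\beta}\right) \cdot \frac{1}{n}\sum_{j=1}^{n} \exp(\hat{\varepsilon}_{j}), \]
where \(\hat{\sigma}^{2}\) and \(\hat{\varepsilon}_{j}\) are the residual variance and the residuals from the log-scale regression.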
comp.trans <- function(var1) {
eqn2 <- formula(paste(paste("log(",var1,")"," ~", sep=""), paste(exp.vars, collapse = "+") ))
lm.fit.1 <- lm(eqn2, data = df[df[, var1] > 0,] )
lny.hat <- predict(lm.fit.1)
yhat.norm <- exp(lny.hat + 0.5*(((summary(lm.fit.1)$sigma)^2)))
Duan <- mean(exp(lm.fit.1$residuals))
yhat.smear <- (exp(lny.hat))*Duan
yhat.diff <- (yhat.norm - yhat.smear)/df[df[,var1]>0, var1]
table.desc.2 <- c(min = min(yhat.diff) , quantile(yhat.diff, c(.01 , .1,.2, .5, .9 , .99)) , max = max(yhat.diff) , N = sum(1*!is.na(yhat.diff)))
return(table.desc.2)
}
table.6 <- sapply(dep.vars, function(x) comp.trans(x))
colnames(table.6) <- c("Total" , "Inpat." , "ER " , "# of presc.")
The following table presents the differences between the retransformation based on the normality assumption and the one based on the smearing estimator.
Table 6: Distribution of the relative differences between the normal retransformation and the smearing estimator: \((\hat{Y}_{norm} - \hat{Y}_{smear})/Y\)
| | Total | Inpat. | ER | # of presc. |
|---|---|---|---|---|
| min | -80.74 | 0.002237 | 0.004896 | 0.001079 |
| 1% | -2.556 | 0.01254 | 0.01038 | 0.003356 |
| 10% | -0.4209 | 0.05061 | 0.0325 | 0.007795 |
| 20% | -0.2266 | 0.0862 | 0.05435 | 0.01189 |
| 50% | -0.06976 | 0.1716 | 0.1337 | 0.03107 |
| 90% | -0.01104 | 0.6795 | 0.6669 | 0.1121 |
| 99% | -0.002374 | 9.421 | 3.475 | 0.297 |
| max | -0.0001004 | 234 | 16.65 | 0.856 |
| N | 20164 | 1308 | 2952 | 13749 |
The differences in prediction due to the retransformation of total expenditure are minor for most of the sample, with the smearing estimator always producing the larger predictions. In the lower half of the distribution the differences increase: at the 10th percentile they represent 42% of the actual value of total expenditure, at the 1st percentile the gap is around 255%, and the largest difference reaches 8074%.
The retransformations for inpatient expenditure and ER expenditure share a similar pattern: the normal retransformation is always larger than the smearing estimator. These differences are negligible for most of the sample, but become substantial in the top decile (around 67% at the 90th percentile) and reach large discrepancies at the extremes.
Finally, the differences for the number of prescriptions behave similarly to the previous two variables, but are between one and two orders of magnitude smaller.
Overall, we can conclude that the two retransformations deliver similar results for roughly 90% of their respective samples, but differ dramatically in the tails.
Here we regress the squared residuals from a linear regression (with the dependent variable in logs) on all the explanatory variables. If there is no heteroskedasticity, the predictors should have little explanatory power over the squared residuals. This is formally captured by the F statistic for the null hypothesis that all regression coefficients are zero. However, given our large sample, we focus not on the p-value but on the statistic itself. We also report the R-squared of each regression.
heterosk.check <- function(var1) {
eqn2 <- formula(paste(paste("log(",var1,")"," ~", sep=""), paste(exp.vars, collapse = "+") ))
lm.fit.2 <- lm(eqn2, data = df[df[, var1] > 0,] )
resid.sqr <- (lm.fit.2$residuals)^2
eqn3 <- formula(paste("resid.sqr ~", paste(exp.vars, collapse = "+") ))
lm.fit.3 <- lm(eqn3, data = df[df[, var1] > 0,] )
summ.lm <- summary(lm.fit.3)
table.tests <- c("F-stat"= summ.lm$fstatistic[1], "p-value" = pf(summ.lm$fstatistic[1], summ.lm$fstatistic[2], summ.lm$fstatistic[3],lower.tail = FALSE) , R2 = summ.lm$r.squared)
table.tests <- format(table.tests, digits=3)
return(table.tests)
}
table.7 <- sapply(dep.vars, function(x) heterosk.check(x))
colnames(table.7) <- c("Total" , "Inpat." , "ER " , "# of presc.")
Table 7: Testing for heteroskedasticity in our four dependent variables
| | Total | Inpat. | ER | # of presc. |
|---|---|---|---|---|
| F-stat | 9.57e+00 | 2.65e+00 | 4.81e+00 | 1.31e+01 |
| p-value | 2.42e-39 | 1.01e-05 | 4.99e-15 | 1.50e-57 |
| R2 | 1.27e-02 | 5.30e-02 | 4.25e-02 | 2.50e-02 |
As we can see, the explanatory power of the predictors over the squared residuals is negligible (the R-squared is below 6% in every regression). This evidence supports the assumption of homoskedasticity in all four models.
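As a cross-check, the same auxiliary regression underlies the studentized Breusch–Pagan test, which is essentially \(nR^{2}\) of the squared-residual regression above. A minimal sketch using `lmtest::bptest`, assuming `df` and `exp.vars` as above and that the `lmtest` package is attached (it also provides `resettest`, used below); `bp.check` is an illustrative name:

```r
# Studentized Breusch-Pagan test on the log-scale mean equation,
# restricted to the positive observations (same sub-sample as above)
bp.check <- function(var1) {
  eqn2 <- formula(paste(paste("log(", var1, ")", " ~", sep = ""), paste(exp.vars, collapse = "+")))
  bptest(lm(eqn2, data = df[df[, var1] > 0, ]))
}
lapply(dep.vars, bp.check)
```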
We test for non-linearities in the regressors using the Pregibon "link" test and the Ramsey RESET test. For the RESET test we include the second through fourth powers of the fitted values.
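As implemented below, the link test regresses the outcome on the fitted index and its square and checks the quadratic term,
\[ \log(y_{i}) = \alpha_{0} + \alpha_{1}\,\widehat{\ln y}_{i} + \alpha_{2}\,\widehat{\ln y}_{i}^{\,2} + u_{i}, \qquad H_{0}: \alpha_{2} = 0, \]
where \(\widehat{\ln y}_{i}\) is the prediction from the log-scale regression.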
non.linear.test <- function(var1) {
y <- df[df[, var1] > 0, var1]
eqn2 <- formula(paste(paste("log(",var1,")"," ~", sep=""), paste(exp.vars, collapse = "+") ))
lm.fit.2 <- lm(eqn2, data = df[df[, var1] > 0,] )
lny.hat <- predict(lm.fit.2)
lm.fit.3 <- lm(log(y)~poly(lny.hat, 2, raw=TRUE))
pregibon <- summary(lm.fit.3)$coef[3,c(1,4)]
reset <- resettest(lm.fit.2, 2:4, type="fitted")
Duan <- mean(exp(lm.fit.2$residuals))
yhat <- (exp(lny.hat))*Duan
decile <- cut(lny.hat, breaks=quantile(lny.hat, probs=seq(0,1, by=0.1)),
include.lowest=TRUE)
raw <- y - yhat
lm.fit.4 <- lm(raw~ as.factor(decile)-1)
test.5 <- linearHypothesis(lm.fit.4, diag(10), rep(0,10), vcov = vcovHC(lm.fit.4, type = "HC"))
hl.test <- as.array(test.5[2,3:4] )
table.tests <- c(pregibon, reset$statistic, reset$p.value, hl.test )
return(table.tests)
}
table.8 <- sapply(dep.vars, function(x) non.linear.test(x))
colnames(table.8) <- c("Total" , "Inpat." , "ER " , "# of presc.")
rownames(table.8) <- c("link.coef", "link.p.value", "RESET.F", "RESET.p.value", "HL.F.stat", "HL.p.value")
In Table 8 we can see that we reject the null hypothesis of linearity for total expenditures and the number of prescriptions. For inpatient and ER expenditures we do not reject the null hypothesis of linearity. It is possible that these results are driven by the much larger sample sizes available for the former two dependent variables.
Finally, we compute a Hosmer-Lemeshow-type test as a third goodness-of-fit check. Using this test we reject, for all four dependent variables, the null hypothesis that the models are on the right scale.
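In the notation of the code above, observations are grouped into deciles of the linear prediction and the joint null hypothesis is
\[ H_{0}: E\left(y - \hat{y} \mid \text{decile } d\right) = 0, \qquad d = 1, \dots, 10, \]
tested with an F statistic and heteroskedasticity-robust standard errors.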
Table 8: Testing for non-linearities and goodness of fit
| | Total | Inpat. | ER | # of presc. |
|---|---|---|---|---|
| link.coef | 0.01819 | -0.1751 | 0.02099 | 0.03204 |
| link.p.value | 0.07554 | 0.2042 | 0.8325 | 0.01629 |
| RESET.F | 16.25 | 0.7395 | 1.099 | 19.61 |
| RESET.p.value | 1.524e-10 | 0.5285 | 0.3482 | 1.103e-12 |
| HL.F.stat | 8.023 | 2.552 | 2.343 | 3.499 |
| HL.p.value | 4.833e-13 | 0.004726 | 0.009467 | 0.0001269 |
Following the class material, we estimate the marginal effect of a continuous regressor (family income) using the following result:
\[ \begin{align} \frac{\partial E(y)}{\partial x_{k}} &= Pr(y>0)\frac{\partial E(y|y>0)}{\partial x_{k}} + E(y|y>0)\frac{\partial Pr(y>0)}{\partial x_{k}} \label{margin.1} \end{align} \]
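With a logit model for participation and a gamma GLM with a log link for the positive outcomes (the specifications used in the code below), the components take the form
\[ Pr(y>0|x) = \Lambda(x\hat{\delta}), \qquad \frac{\partial Pr(y>0|x)}{\partial x_{k}} = \hat{\delta}_{k}\,\Lambda(x\hat{\delta})\left\{1-\Lambda(x\hat{\delta})\right\}, \]
\[ E(y|y>0, x) = \exp(x\hat{\beta}), \qquad \frac{\partial E(y|y>0, x)}{\partial x_{k}} = \hat{\beta}_{k}\,\exp(x\hat{\beta}), \]
where \(\Lambda(\cdot)\) is the logistic cdf; the code additionally applies a Duan-type correction factor to the level predictions from the gamma equation.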
Each component is estimated in the next code chunk:
marg.eff.1 <- function(var1) {
# var1 <- dep.vars[1]
df$particip <- 1*(df[, var1]>0)
eqn3 <- formula(paste("particip~", paste(exp.vars, collapse = "+") ) )
glm.fit.3 <- glm(eqn3, family=binomial, data=df)
eqn1 <- formula(paste(paste(var1," ~", sep=""), paste(exp.vars, collapse = "+") ))
glm.fit.4 <- glm(formula = eqn1, family = Gamma(link=log), data = df[ df[,var1] > 0, ])
#Component 1: Pr(y>0) (from logit equation)
comp.1 <- glm.fit.3$fitted.values
#Component 2: dE(y|y>0)/dx_{k} (from equation in logs)
y.link <- predict(glm.fit.4, newdata = df, type="link")
comp.2 <- glm.fit.4$coefficients["faminc"]*exp(y.link)
#Component 3: E(y|y>0) (from equation in logs)
Duan <- mean(exp(log(df[,var1]) - y.link))
comp.3 <- exp(y.link)*Duan
#Component 4: dPr(y>0)/dx_{k} (from logit equation)
comp.4 <- glm.fit.3$coefficients["faminc"] * exp(glm.fit.3$linear.predictors)/(1+exp(glm.fit.3$linear.predictors))^2
res1 <- 1000 * (comp.1 * comp.2 + comp.3 * comp.4)
return(res1)
}
Following the class material, we estimate the marginal effect of a binary regressor (insurance coverage, `ins`) using the following result:
\[ \begin{align} E(y|x_{k}=1) - E(y|x_{k}=0) &= \left\{ Pr(y>0|x_{k}=1) - Pr(y>0|x_{k}=0) \right\} E(y|x_{k}=1, y>0) \nonumber \\ &\quad + \left\{ E(y|x_{k}=1, y>0) - E(y|x_{k}=0, y>0) \right\} Pr(y>0|x_{k}=0) \label{margin.2} \end{align} \]
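Each term is obtained from counterfactual predictions in which the insurance indicator (`ins`) is set to one and to zero for every observation, using the same logit and gamma equations as above:
\[ \widehat{Pr}(y>0|x, ins=j) = \Lambda\left(x_{-ins}\hat{\delta} + \hat{\delta}_{ins}\, j\right), \qquad \widehat{E}(y|y>0, x, ins=j) = \hat{\varphi}\,\exp\left(x_{-ins}\hat{\beta} + \hat{\beta}_{ins}\, j\right), \qquad j = 0, 1, \]
where \(\hat{\varphi}\) denotes the Duan-type correction factor computed in the code.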
In the following chunk of code we compute each component.
#Comp.1:First term in braces
marg.eff.2 <- function(var1) {
# var1 <- dep.vars[1]
df$particip <- 1*(df[, var1]>0)
eqn3 <- formula(paste("particip~", paste(exp.vars, collapse = "+") ) )
glm.fit.3 <- glm(eqn3, family=binomial, data=df)
eqn1 <- formula(paste(paste(var1," ~", sep=""), paste(exp.vars, collapse = "+") ))
glm.fit.4 <- glm(formula = eqn1, family = Gamma(link=log), data = df[ df[,var1] > 0, ])
df1 <- df
df1$ins <- 1
pr.1 <- predict(glm.fit.3, type="response" , newdata = df1)
df1$ins <- 0
pr.0 <- predict(glm.fit.3, type="response" , newdata = df1)
comp.1 <- pr.1 - pr.0
#Comp.2:Second term in braces
df1$ins <- 1
y.link <- predict(glm.fit.4, newdata = df, type="link")
Duan <- mean(exp(log(df[,var1]) - y.link))
y.link.1 <- predict(glm.fit.4, newdata = df1, type="link")
y.hat.1 <- exp(y.link.1)*Duan
df1$ins <- 0
y.link.0 <- predict(glm.fit.4, newdata = df1, type="link")
y.hat.0 <- exp(y.link.0)*Duan
comp.2 <- y.hat.1 - y.hat.0
res1 <- (comp.1 * y.hat.1 + comp.2 * pr.0)
return(res1)
}
table.9 <- sapply(dep.vars, function(x) c(mean(marg.eff.1(x)) , mean(marg.eff.2(x))))
table.9 <- format(table.9, digits=3)
colnames(table.9) <- c("Total" , "Inpat." , "ER " , "# of presc.")
rownames(table.9) <- c("d faminc ($1000)", "d insurance")
Table 9: Marginal effects for continuous and binary regressors
| | Total | Inpat. | ER | # of presc. |
|---|---|---|---|---|
| d faminc ($1000) | 1.674 | 0.404 | 1.322 | 0.065 |
| d insurance | 844.635 | 13.814 | 4.750 | 1.180 |
For total expenditure the marginal effects are similar in magnitude to those in the slides, but not exactly identical. Changes in family income are expressed in $1000 increments. Family income has a small effect on all the monetary variables and on the number of prescriptions (recall that the median number of prescriptions is 1). Obtaining insurance has a very large effect on total expenditure and on the number of prescriptions, but once again there are no strong effects on inpatient and ER expenditure.
To be completed at an undetermined future date.