table.desc <- function(var1) {
res1 <- c(mean = mean(var1),
          sd = sd(var1),
          quantile(var1, c(.01, .1, .5, .9, .99)),
          perc.zero = mean(1*(var1 == 0)),   # share of exact zeros
          N = sum(1*!is.na(var1)))           # number of non-missing observations
return(res1)
}
table.1 <- sapply(dep.vars, function(x) table.desc(df[,x]))
colnames(table.1) <- c("Total" , "Inpat." , "ER " , "# of presc.")
pander(table.1)
| | Total | Inpat. | ER | # of presc. |
|---|---|---|---|---|
| mean | 2390 | 645 | 111.7 | 6.248 |
| sd | 9205 | 6531 | 604.7 | 14.38 |
| 1% | 0 | 0 | 0 | 0 |
| 10% | 0 | 0 | 0 | 0 |
| 50% | 435 | 0 | 0 | 1 |
| 90% | 5507 | 0 | 110 | 18 |
| 99% | 29101 | 15208 | 2521 | 69 |
| perc.zero | 0.2086 | 0.9487 | 0.8841 | 0.4604 |
| N | 25479 | 25479 | 25479 | 25479 |
Assuming the data are in 2007 dollars, the average total expenditure per patient (presumably per year) is $2390. The distributions of all types of expenditure are severely skewed to the right, most likely because a large fraction of the population has zero expenditure.
In what follows, we start by analyzing only observations with strictly positive values of the dependent variables. For this purpose, we present descriptive statistics for this sub-sample.
table.2 <- sapply(dep.vars, function(x) table.desc(df[,x][df[,x]>0]))
colnames(table.2) <- c("Total" , "Inpat." , "ER " , "# of presc.")
pander(table.2[-which(rownames(table.2) %in% "perc.zero"),])
| | Total | Inpat. | ER | # of presc. |
|---|---|---|---|---|
| mean | 3020 | 12565 | 964 | 11.58 |
| sd | 10255 | 26108 | 1528 | 17.93 |
| 1% | 17 | 105.4 | 18 | 1 |
| 10% | 101 | 1536 | 85.1 | 1 |
| 50% | 771.5 | 6070 | 463 | 5 |
| 90% | 6738 | 26181 | 2274 | 29.2 |
| 99% | 32949 | 103259 | 7605 | 86.52 |
| N | 20164 | 1308 | 2952 | 13749 |
Here we estimate the optimal Box-Cox transformation of each dependent variable, which takes the form:
\[ \begin{align} \frac{y^{\lambda} - 1}{\lambda} &= X\beta + \varepsilon \label{boxcox.1} \end{align} \]
boxcox.w.cov <- function(var1) {
eqn1 <- formula(paste(paste(var1," ~", sep=""), paste(exp.vars, collapse = "+") ))
res1 <- boxcox(eqn1, data = df[ df[,var1] > 0, ], lambda = seq(-.1, .2, length = 100), ylab=var1)
lambda.hat <- res1$x[which(res1$y == max(res1$y))]
# loglik.hat <- max(res1$y)
# tab.boxcox.w<- c(lambda.hat, loglik.hat)
return(lambda.hat)
}
par(mfrow=c(2,2))
table.3 <- round(sapply(dep.vars, function(x) boxcox.w.cov(x)), digits = 4)
mtext("Optimal Box-Cox transformation for each dependent variable", side=3, outer=TRUE, line=-3)
Table 3: Box-Cox estimates for transformation parameter \(\lambda\)
| Dep. variable | \(\hat{\lambda}\) |
|---|---|
| totexp | 0.0152 |
| ipexp | 0.1273 |
| erexp | 0.0545 |
| rxtot | -0.0818 |
As we can see, all estimates are very close to zero, which indicates that the log transformation is the most appropriate one.
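This is consistent with the limiting behavior of the Box-Cox family:
\[ \lim_{\lambda \to 0} \frac{y^{\lambda} - 1}{\lambda} = \log(y), \]
so an estimate of \(\lambda\) near zero points to the log transformation, while \(\hat{\lambda} = 1\) would correspond to a (shifted) linear specification.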
If we repeat the exercise but now without independent variables we get:
boxcox.wo.cov <- function(var1) {
eqn1 <- formula (paste(var1, 1, sep="~") )
res1 <- boxcox(eqn1, data = df[ df[,var1] > 0, ], lambda = seq(-.1, .2, length = 100), ylab=var1)
lambda.hat <- res1$x[which(res1$y == max(res1$y))]
# loglik.hat <- max(res1$y)
# tab.boxcox.w<- c(lambda.hat, loglik.hat)
return(lambda.hat)
}
par(mfrow=c(2,2))
table.4 <- round(sapply(dep.vars, function(x) boxcox.wo.cov(x)), digits =4)
mtext("Optimal Box-Cox transformation for each dependent variable", side=3, outer=TRUE, line=-3)
Table 4: Box-Cox estimates for transformation parameter \(\lambda\), without covariates
| Dep. variable | \(\hat{\lambda}\) |
|---|---|
| totexp | 0 |
| ipexp | 0.1212 |
| erexp | 0.0455 |
| rxtot | -0.1 |
The results are qualitatively very similar. All of this suggests that, when estimating by OLS, we should use the log transformation.
Here we test which GLM family provides the best fit.
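The procedure in the code below is a modified Park-type test: after fitting a log-link Poisson GLM, the squared raw-scale residuals are regressed (again with a log link) on the log of the fitted values,
\[ E\left[ (y_{i} - \hat{y}_{i})^{2} \mid x_{i} \right] = \alpha_{0}\, \hat{y}_{i}^{\gamma}. \]
The estimate \(\hat{\gamma}\) is reported in the "coef" row of Table 5, and the values \(\gamma = 0, 1, 2, 3\) correspond to the Gaussian, Poisson, gamma, and inverse Gaussian families; these are the four null hypotheses tested below.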
glm.family <- function(var1) {
eqn1 <- formula(paste(paste(var1," ~", sep=""), paste(exp.vars, collapse = "+") ))
glm.fit.1 <- glm(formula = eqn1, family = poisson, data = df[ df[,var1] > 0, ])
y.hat <- predict(glm.fit.1, type= "response")
lny.hat <- predict(glm.fit.1, type= "link")
raw2 <- ( df[df[,var1] > 0 , var1] - y.hat )^2
glm.fit.2 <- glm(raw2~lny.hat, family=poisson , data = df[ df[,var1] > 0, ])
test.0 <- linearHypothesis(glm.fit.2, c("lny.hat = 0"), test = "Chisq", vcov = vcovHC(glm.fit.2, type = "HC"))
test.1 <- linearHypothesis(glm.fit.2, c("lny.hat = 1"), test = "Chisq", vcov = vcovHC(glm.fit.2, type = "HC"))
test.2 <- linearHypothesis(glm.fit.2, c("lny.hat = 2"), test = "Chisq", vcov = vcovHC(glm.fit.2, type = "HC"))
test.3 <- linearHypothesis(glm.fit.2, c("lny.hat = 3"), test = "Chisq", vcov = vcovHC(glm.fit.2, type = "HC"))
table.tests <- as.data.frame(rbind(test.0[2,3:4], test.1[2,3:4], test.2[2,3:4], test.3[2,3:4]))
table.tests <- format(table.tests , digits = 3)
table.tests <- c(format(glm.fit.2$coef[2],digits =3) , paste(table.tests[,1], "(", table.tests[,2], ")"))
return(table.tests)
}
table.5 <- sapply(dep.vars, function(x) glm.family(x))
rownames(table.5) <- c("coef" , paste("H0: ", 0:3) )
The following table presents the chi-squared statistic (with the p-value in parentheses) for each GLM family test, for all dependent variables. Notice that we are still working only with the positive values of each dependent variable. Additionally, we perform all calculations with robust standard errors in order to obtain results comparable to those in the class slides.
Table 5: GLM family test for all dependent variables
| | totexp | ipexp |
|---|---|---|
| coef | 1.51 | 2.45 |
| H0: 0 | 127.3 ( 1.60e-29 ) | 89.55 ( 2.99e-21 ) |
| H0: 1 | 14.5 ( 1.38e-04 ) | 31.43 ( 2.06e-08 ) |
| H0: 2 | 13.4 ( 2.53e-04 ) | 3.06 ( 8.01e-02 ) |
| H0: 3 | 123.9 ( 8.90e-29 ) | 4.44 ( 3.52e-02 ) |
| | erexp | rxtot |
|---|---|---|
| coef | 1.95 | 1.55 |
| H0: 0 | 101.7940 ( 6.16e-24 ) | 1267 ( 1.38e-277 ) |
| H0: 1 | 24.1635 ( 8.85e-07 ) | 160 ( 1.32e-36 ) |
| H0: 2 | 0.0666 ( 7.96e-01 ) | 107 ( 5.36e-25 ) |
| H0: 3 | 29.5033 ( 5.58e-08 ) | 1108 ( 5.44e-243 ) |
Although we reject the null hypothesis in every case, the hypothesis closest to not being rejected is \(H_{0}: \gamma = 2\) for all the dependent variables. Hence we should use the gamma family in our GLM estimations.
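For concreteness, the specification carried forward for the positive observations is a gamma GLM with a log link. A minimal sketch, assuming `df`, `dep.vars`, and `exp.vars` as defined above; `gamma.fit` and `gamma.fits` are illustrative names, and the same call appears inside the marginal-effects functions below:

```r
# Gamma GLM with log link, fit on the positive observations of each outcome
gamma.fit <- function(var1) {
  eqn1 <- formula(paste(var1, "~", paste(exp.vars, collapse = "+")))
  glm(eqn1, family = Gamma(link = log), data = df[df[, var1] > 0, ])
}
gamma.fits <- lapply(dep.vars, gamma.fit)
```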
We now estimate a linear regression model for each dependent variable in logs and compare the two standard retransformations of the fitted values.
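These are the normal-theory retransformation and Duan's smearing estimator, computed in the code below as
\[ \hat{y}^{norm}_{i} = \exp\left(x_{i}\hat{\beta} + \tfrac{1}{2}\hat{\sigma}^{2}\right), \qquad \hat{y}^{smear}_{i} = \exp\left(x_{i}\hat{\beta}\right) \cdot \frac{1}{n}\sum_{j=1}^{n} \exp(\hat{\varepsilon}_{j}), \]
where \(\hat{\sigma}^{2}\) and \(\hat{\varepsilon}_{j}\) are the residual variance and the residuals from the log-scale regression.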
comp.trans <- function(var1) {
eqn2 <- formula(paste(paste("log(",var1,")"," ~", sep=""), paste(exp.vars, collapse = "+") ))
lm.fit.1 <- lm(eqn2, data = df[df[, var1] > 0,] )
lny.hat <- predict(lm.fit.1)
yhat.norm <- exp(lny.hat + 0.5*(((summary(lm.fit.1)$sigma)^2)))
Duan <- mean(exp(lm.fit.1$residuals))
yhat.smear <- (exp(lny.hat))*Duan
yhat.diff <- (yhat.norm - yhat.smear)/df[df[,var1]>0, var1]
table.desc.2 <- c(min = min(yhat.diff) , quantile(yhat.diff, c(.01 , .1,.2, .5, .9 , .99)) , max = max(yhat.diff) , N = sum(1*!is.na(yhat.diff)))
return(table.desc.2)
}
table.6 <- sapply(dep.vars, function(x) comp.trans(x))
colnames(table.6) <- c("Total" , "Inpat." , "ER " , "# of presc.")
The following table presents the differences between the retransformation based on the normality assumption and the one based on the smearing estimator.
Table 6: Distribution of the relative differences between the normal retransformation and the smearing estimator: \((\hat{Y}_{norm} - \hat{Y}_{smear})/Y\)
| | Total | Inpat. | ER | # of presc. |
|---|---|---|---|---|
| min | -80.74 | 0.002237 | 0.004896 | 0.001079 |
| 1% | -2.556 | 0.01254 | 0.01038 | 0.003356 |
| 10% | -0.4209 | 0.05061 | 0.0325 | 0.007795 |
| 20% | -0.2266 | 0.0862 | 0.05435 | 0.01189 |
| 50% | -0.06976 | 0.1716 | 0.1337 | 0.03107 |
| 90% | -0.01104 | 0.6795 | 0.6669 | 0.1121 |
| 99% | -0.002374 | 9.421 | 3.475 | 0.297 |
| max | -0.0001004 | 234 | 16.65 | 0.856 |
| N | 20164 | 1308 | 2952 | 13749 |
The differences in prediction due to the retransformation of total expenditure are minor for most of the sample, with the smearing estimator always producing the larger predictions. In the lower half of the distribution the differences increase: at the 10th percentile they represent 42% of the actual value of total expenditure, at the 1st percentile the gap is around 255%, and the largest difference reaches 8074%.
The retransformations for inpatient expenditure and ER expenditure share a similar pattern: the normal retransformation is always larger than the smearing estimator. These differences are negligible for most of the sample, but become substantial in the top decile (around 67% at the 90th percentile) and reach large discrepancies at the extremes.
Finally, the differences for the number of prescriptions behave similarly to the previous two variables, but are between one and two orders of magnitude smaller.
Overall, we can conclude that the two retransformations deliver similar results for roughly 90% of their respective samples, but differ dramatically in the tails.
Here we regress the squared residuals from a linear regression (with the dependent variable in logs) on all the explanatory variables. If there is no heteroskedasticity, the predictors should have little explanatory power over the squared residuals. This is formally captured by the F statistic for the null hypothesis that all regression coefficients are zero. However, given our large sample, we focus not on the p-value but on the statistic itself. We also report the R-squared of each regression.
heterosk.check <- function(var1) {
eqn2 <- formula(paste(paste("log(",var1,")"," ~", sep=""), paste(exp.vars, collapse = "+") ))
lm.fit.2 <- lm(eqn2, data = df[df[, var1] > 0,] )
resid.sqr <- (lm.fit.2$residuals)^2
eqn3 <- formula(paste("resid.sqr ~", paste(exp.vars, collapse = "+") ))
lm.fit.3 <- lm(eqn3, data = df[df[, var1] > 0,] )
summ.lm <- summary(lm.fit.3)
table.tests <- c("F-stat"= summ.lm$fstatistic[1], "p-value" = pf(summ.lm$fstatistic[1], summ.lm$fstatistic[2], summ.lm$fstatistic[3],lower.tail = FALSE) , R2 = summ.lm$r.squared)
table.tests <- format(table.tests, digits=3)
return(table.tests)
}
table.7 <- sapply(dep.vars, function(x) heterosk.check(x))
colnames(table.7) <- c("Total" , "Inpat." , "ER " , "# of presc.")
Table 7: Testing for heteroskedasticity in our four dependent variables
| | Total | Inpat. | ER | # of presc. |
|---|---|---|---|---|
| F-stat | 9.57e+00 | 2.65e+00 | 4.81e+00 | 1.31e+01 |
| p-value | 2.42e-39 | 1.01e-05 | 4.99e-15 | 1.50e-57 |
| R2 | 1.27e-02 | 5.30e-02 | 4.25e-02 | 2.50e-02 |
As we can see, the explanatory power of the predictors over the squared residuals is negligible (the R-squared is below 6% in every regression). This evidence supports the assumption of homoskedasticity in all four models.
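As a cross-check, the same auxiliary regression underlies the studentized Breusch–Pagan test, which is essentially \(nR^{2}\) of the squared-residual regression above. A minimal sketch using `lmtest::bptest`, assuming `df` and `exp.vars` as above and that the `lmtest` package is attached (it also provides `resettest`, used below); `bp.check` is an illustrative name:

```r
# Studentized Breusch-Pagan test on the log-scale mean equation,
# restricted to the positive observations (same sub-sample as above)
bp.check <- function(var1) {
  eqn2 <- formula(paste(paste("log(", var1, ")", " ~", sep = ""), paste(exp.vars, collapse = "+")))
  bptest(lm(eqn2, data = df[df[, var1] > 0, ]))
}
lapply(dep.vars, bp.check)
```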
We test for non-linearities in the regressors using the Pregibon "link" test and the Ramsey RESET test. For the RESET test we include the second through fourth powers of the fitted values.
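As implemented below, the link test regresses the outcome on the fitted index and its square and checks the quadratic term,
\[ \log(y_{i}) = \alpha_{0} + \alpha_{1}\,\widehat{\ln y}_{i} + \alpha_{2}\,\widehat{\ln y}_{i}^{\,2} + u_{i}, \qquad H_{0}: \alpha_{2} = 0, \]
where \(\widehat{\ln y}_{i}\) is the prediction from the log-scale regression.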
non.linear.test <- function(var1) {
y <- df[df[, var1] > 0, var1]
eqn2 <- formula(paste(paste("log(",var1,")"," ~", sep=""), paste(exp.vars, collapse = "+") ))
lm.fit.2 <- lm(eqn2, data = df[df[, var1] > 0,] )
lny.hat <- predict(lm.fit.2)
lm.fit.3 <- lm(log(y)~poly(lny.hat, 2, raw=TRUE))
pregibon <- summary(lm.fit.3)$coef[3,c(1,4)]
reset <- resettest(lm.fit.2, 2:4, type="fitted")
Duan <- mean(exp(lm.fit.2$residuals))
yhat <- (exp(lny.hat))*Duan
decile <- cut(lny.hat, breaks=quantile(lny.hat, probs=seq(0,1, by=0.1)),
include.lowest=TRUE)
raw <- y - yhat
lm.fit.4 <- lm(raw~ as.factor(decile)-1)
test.5 <- linearHypothesis(lm.fit.4, diag(10), rep(0,10), vcov = vcovHC(lm.fit.4, type = "HC"))
hl.test <- as.array(test.5[2,3:4] )
table.tests <- c(pregibon, reset$statistic, reset$p.value, hl.test )
return(table.tests)
}
table.8 <- sapply(dep.vars, function(x) non.linear.test(x))
colnames(table.8) <- c("Total" , "Inpat." , "ER " , "# of presc.")
rownames(table.8) <- c("link.coef", "link.p.value", "RESET.F", "RESET.p.value", "HL.F.stat", "HL.p.value")
In Table 8 we can see that we reject the null hypothesis of linearity for total expenditures and the number of prescriptions. For inpatient and ER expenditures we do not reject the null hypothesis of linearity. It is possible that these results are driven by the much larger sample sizes available for the former two dependent variables.
Finally, we compute a Hosmer-Lemeshow-type test as a third goodness-of-fit check. Using this test we reject, for all four dependent variables, the null hypothesis that the models are on the right scale.
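In the notation of the code above, observations are grouped into deciles of the linear prediction and the joint null hypothesis is
\[ H_{0}: E\left(y - \hat{y} \mid \text{decile } d\right) = 0, \qquad d = 1, \dots, 10, \]
tested with an F statistic and heteroskedasticity-robust standard errors.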
Table 8: Testing for non-linearities and goodness of fit
| | Total | Inpat. | ER | # of presc. |
|---|---|---|---|---|
| link.coef | 0.01819 | -0.1751 | 0.02099 | 0.03204 |
| link.p.value | 0.07554 | 0.2042 | 0.8325 | 0.01629 |
| RESET.F | 16.25 | 0.7395 | 1.099 | 19.61 |
| RESET.p.value | 1.524e-10 | 0.5285 | 0.3482 | 1.103e-12 |
| HL.F.stat | 8.023 | 2.552 | 2.343 | 3.499 |
| HL.p.value | 4.833e-13 | 0.004726 | 0.009467 | 0.0001269 |
Following the class material, we estimate the marginal effect of a continuous regressor (family income) using the following result:
\[ \begin{align} \frac{\partial E(y)}{\partial x_{k}} &= Pr(y>0)\frac{\partial E(y|y>0)}{\partial x_{k}} + E(y|y>0)\frac{\partial Pr(y>0)}{\partial x_{k}} \label{margin.1} \end{align} \]
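With a logit model for participation and a gamma GLM with a log link for the positive outcomes (the specifications used in the code below), the components take the form
\[ Pr(y>0|x) = \Lambda(x\hat{\delta}), \qquad \frac{\partial Pr(y>0|x)}{\partial x_{k}} = \hat{\delta}_{k}\,\Lambda(x\hat{\delta})\left\{1-\Lambda(x\hat{\delta})\right\}, \]
\[ E(y|y>0, x) = \exp(x\hat{\beta}), \qquad \frac{\partial E(y|y>0, x)}{\partial x_{k}} = \hat{\beta}_{k}\,\exp(x\hat{\beta}), \]
where \(\Lambda(\cdot)\) is the logistic cdf; the code additionally applies a Duan-type correction factor to the level predictions from the gamma equation.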
Each component is estimated in the next code chunk:
marg.eff.1 <- function(var1) {
# var1 <- dep.vars[1]
df$particip <- 1*(df[, var1]>0)
eqn3 <- formula(paste("particip~", paste(exp.vars, collapse = "+") ) )
glm.fit.3 <- glm(eqn3, family=binomial, data=df)
eqn1 <- formula(paste(paste(var1," ~", sep=""), paste(exp.vars, collapse = "+") ))
glm.fit.4 <- glm(formula = eqn1, family = Gamma(link=log), data = df[ df[,var1] > 0, ])
#Component 1: Pr(y>0) (from logit equation)
comp.1 <- glm.fit.3$fitted.values
#Component 2: dE(y|y>0)/dx_{k} (from equation in logs)
y.link <- predict(glm.fit.4, newdata = df, type="link")
comp.2 <- glm.fit.4$coefficients["faminc"]*exp(y.link)
#Component 3: E(y|y>0) (from equation in logs)
Duan <- mean(exp(log(df[,var1]) - y.link))
comp.3 <- exp(y.link)*Duan
#Component 4: dPr(y>0)/dx_{k} (from logit equation)
comp.4 <- glm.fit.3$coefficients["faminc"] * exp(glm.fit.3$linear.predictors)/(1+exp(glm.fit.3$linear.predictors))^2
res1 <- 1000 * (comp.1 * comp.2 + comp.3 * comp.4)
return(res1)
}
Following the class material, we estimate the marginal effect of a binary regressor (insurance coverage, `ins`) using the following result:
\[ \begin{align} E(y|x_{k}=1) - E(y|x_{k}=0) &= \left\{ Pr(y>0|x_{k}=1) - Pr(y>0|x_{k}=0) \right\} E(y|x_{k}=1, y>0) \nonumber \\ &\quad + \left\{ E(y|x_{k}=1, y>0) - E(y|x_{k}=0, y>0) \right\} Pr(y>0|x_{k}=0) \label{margin.2} \end{align} \]
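Each term is obtained from counterfactual predictions in which the insurance indicator (`ins`) is set to one and to zero for every observation, using the same logit and gamma equations as above:
\[ \widehat{Pr}(y>0|x, ins=j) = \Lambda\left(x_{-ins}\hat{\delta} + \hat{\delta}_{ins}\, j\right), \qquad \widehat{E}(y|y>0, x, ins=j) = \hat{\varphi}\,\exp\left(x_{-ins}\hat{\beta} + \hat{\beta}_{ins}\, j\right), \qquad j = 0, 1, \]
where \(\hat{\varphi}\) denotes the Duan-type correction factor computed in the code.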
In the following chunk of code we compute each component.
#Comp.1:First term in braces
marg.eff.2 <- function(var1) {
# var1 <- dep.vars[1]
df$particip <- 1*(df[, var1]>0)
eqn3 <- formula(paste("particip~", paste(exp.vars, collapse = "+") ) )
glm.fit.3 <- glm(eqn3, family=binomial, data=df)
eqn1 <- formula(paste(paste(var1," ~", sep=""), paste(exp.vars, collapse = "+") ))
glm.fit.4 <- glm(formula = eqn1, family = Gamma(link=log), data = df[ df[,var1] > 0, ])
df1 <- df
df1$ins <- 1
pr.1 <- predict(glm.fit.3, type="response" , newdata = df1)
df1$ins <- 0
pr.0 <- predict(glm.fit.3, type="response" , newdata = df1)
comp.1 <- pr.1 - pr.0
#Comp.2:Second term in braces
df1$ins <- 1
y.link <- predict(glm.fit.4, newdata = df, type="link")
Duan <- mean(exp(log(df[,var1]) - y.link))
y.link.1 <- predict(glm.fit.4, newdata = df1, type="link")
y.hat.1 <- exp(y.link.1)*Duan
df1$ins <- 0
y.link.0 <- predict(glm.fit.4, newdata = df1, type="link")
y.hat.0 <- exp(y.link.0)*Duan
comp.2 <- y.hat.1 - y.hat.0
res1 <- (comp.1 * y.hat.1 + comp.2 * pr.0)
return(res1)
}
table.9 <- sapply(dep.vars, function(x) c(mean(marg.eff.1(x)) , mean(marg.eff.2(x))))
table.9 <- format(table.9, digits=3)
colnames(table.9) <- c("Total" , "Inpat." , "ER " , "# of presc.")
rownames(table.9) <- c("d faminc ($1000)", "d insurance")
Table 9: Marginal effects for continuous and binary regressors
| | Total | Inpat. | ER | # of presc. |
|---|---|---|---|---|
| d faminc ($1000) | 1.674 | 0.404 | 1.322 | 0.065 |
| d insurance | 844.635 | 13.814 | 4.750 | 1.180 |
For total expenditure the marginal effects are similar in magnitude to those in the slides, but not exactly identical. Changes in family income are expressed in $1000 increments. Family income has a small effect on all the monetary variables and on the number of prescriptions (recall that the median number of prescriptions is 1). Obtaining insurance has a very large effect on total expenditure and on the number of prescriptions, but once again there are no strong effects on inpatient and ER expenditure.
To be completed at an undetermined future date.