Exercise 1

Check out the data set “data_health_insurance_experiment.csv”, from a randomized controlled trial run to study the effect of expanding public health insurance on health care use, health outcomes, financial strain, and the well-being of low-income adults.

# Variable names:
# Y: Any Emergency Department visit in the study period
# D: Ever enrolled in Medicaid in the study period
# Z: Selected in the lottery
# numhh_1: household head + 1
# numhh_2: household head + 2
# english: individual requested English-language materials
# female: female dummy
# birthyear: year of birth

## You can find more information about the experiment and the data here:
## https://www.nber.org/programs-projects/projects-and-centers/oregon-health-insurance-experiment?page=1&perPage=50

data <- read.csv("data_health_insurance_experiment.csv")
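A quick inspection after loading (output omitted here) confirms the sample size and variable coding:

# Dimensions and variable types
str(data)
summary(data[, c("Y", "D", "Z")])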

Ex. 1.a

## OLS
ols <- lm(data = data, formula = Y ~ D)
ols_controls <- lm(data = data, formula = Y ~ D + numhh_1 + numhh_2 + birthyear + english + female)
library(jtools)
summ(ols, digits = 4, robust = TRUE)
## MODEL INFO:
## Observations: 24646
## Dependent Variable: Y
## Type: OLS linear regression 
## 
## MODEL FIT:
## F(1,24644) = 577.0421, p = 0.0000
## R² = 0.0229
## Adj. R² = 0.0228 
## 
## Standard errors: Robust, type = HC3
## ------------------------------------------------------
##                       Est.     S.E.    t val.        p
## ----------------- -------- -------- --------- --------
## (Intercept)         0.3026   0.0034   89.8342   0.0000
## D                   0.1670   0.0073   23.0272   0.0000
## ------------------------------------------------------
summ(ols_controls, digits = 4, robust = TRUE)
## MODEL INFO:
## Observations: 24646
## Dependent Variable: Y
## Type: OLS linear regression 
## 
## MODEL FIT:
## F(6,24639) = 240.4794, p = 0.0000
## R² = 0.0553
## Adj. R² = 0.0551 
## 
## Standard errors: Robust, type = HC3
## --------------------------------------------------------
##                        Est.     S.E.     t val.        p
## ----------------- --------- -------- ---------- --------
## (Intercept)          1.0735   0.4779     2.2464   0.0247
## D                    0.1701   0.0071    23.8549   0.0000
## numhh_1             -0.0967   0.0071   -13.5271   0.0000
## numhh_2             -0.1031   0.0543    -1.8998   0.0575
## birthyear           -0.0005   0.0002    -1.9240   0.0544
## english              0.1934   0.0074    26.1743   0.0000
## female              -0.0002   0.0059    -0.0277   0.9779
## --------------------------------------------------------
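As a cross-check, the same HC3-robust standard errors can be reproduced with the sandwich and lmtest packages:

# HC3-robust coefficient tests, matching summ(..., robust = TRUE)
library(sandwich)
library(lmtest)
coeftest(ols, vcov. = vcovHC(ols, type = "HC3"))
coeftest(ols_controls, vcov. = vcovHC(ols_controls, type = "HC3"))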

Ex. 1.b

## Double-ML-OLS
library(DoubleML)
library(mlr3)
library(mlr3learners)
library(ggplot2)
library(data.table)
library(ivreg)
library(ranger)
# Specify the data and variables for the causal model
dml_data = DoubleMLData$new(data,
                            y_col = "Y",
                            d_cols = "D",
                            x_cols = c("numhh_1", "numhh_2", "birthyear",
                                       "english", "female"))
print(dml_data)
## ================= DoubleMLData Object ==================
## 
## 
## ------------------ Data summary      ------------------
## Outcome variable: Y
## Treatment variable(s): D
## Covariates: numhh_1, numhh_2, birthyear, english, female
## Instrument(s): 
## No. Observations: 24646
# suppress messages from the mlr3 package during fitting
lgr::get_logger("mlr3")$set_threshold("warn")

Learners to estimate the nuisance models:

Version 1: Random forest

# Option mtry is usually set to the square root of the number of controls, mtry = floor(sqrt(n_vars)); here mtry = 5 makes all five controls available at every split
forest = lrn("regr.ranger", num.trees=500, mtry=5, max.depth=5, min.node.size=2)
forest
## <LearnerRegrRanger:regr.ranger>: Random Forest
## * Model: -
## * Parameters: max.depth=5, min.node.size=2, mtry=5, num.threads=1,
##   num.trees=500
## * Packages: mlr3, mlr3learners, ranger
## * Predict Types:  [response], se
## * Feature Types: logical, integer, numeric, character, factor, ordered
## * Properties: hotstart_backward, importance, oob_error, weights

Version 2: Lasso

#learner = lrn("regr.glmnet", lambda = sqrt(log(n_vars)/(n_obs)))
lasso = lrn("regr.cv_glmnet")
lasso
## <LearnerRegrCVGlmnet:regr.cv_glmnet>: GLM with Elastic Net Regularization
## * Model: -
## * Parameters: family=gaussian
## * Packages: mlr3, mlr3learners, glmnet
## * Predict Types:  [response]
## * Feature Types: logical, integer, numeric
## * Properties: selected_features, weights

Version 3: Regression Trees

trees = lrn("regr.rpart")

set.seed(10101)
dml_plr_forest = DoubleMLPLR$new(dml_data, ml_l=forest, ml_m=forest)
dml_plr_forest$fit()
print(dml_plr_forest)
## ================= DoubleMLPLR Object ==================
## 
## 
## ------------------ Data summary      ------------------
## Outcome variable: Y
## Treatment variable(s): D
## Covariates: numhh_1, numhh_2, birthyear, english, female
## Instrument(s): 
## No. Observations: 24646
## 
## ------------------ Score & algorithm ------------------
## Score function: partialling out
## DML algorithm: dml2
## 
## ------------------ Machine learner   ------------------
## ml_l: regr.ranger
## ml_m: regr.ranger
## 
## ------------------ Resampling        ------------------
## No. folds: 5
## No. repeated sample splits: 1
## Apply cross-fitting: TRUE
## 
## ------------------ Fit summary       ------------------
##  Estimates and significance testing of the effect of target variables
##   Estimate. Std. Error t value Pr(>|t|)    
## D  0.171169   0.007145   23.96   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
dml_plr_lasso = DoubleMLPLR$new(dml_data, ml_l=lasso, ml_m=lasso)
dml_plr_lasso$fit()
print(dml_plr_lasso)
## ================= DoubleMLPLR Object ==================
## 
## 
## ------------------ Data summary      ------------------
## Outcome variable: Y
## Treatment variable(s): D
## Covariates: numhh_1, numhh_2, birthyear, english, female
## Instrument(s): 
## No. Observations: 24646
## 
## ------------------ Score & algorithm ------------------
## Score function: partialling out
## DML algorithm: dml2
## 
## ------------------ Machine learner   ------------------
## ml_l: regr.cv_glmnet
## ml_m: regr.cv_glmnet
## 
## ------------------ Resampling        ------------------
## No. folds: 5
## No. repeated sample splits: 1
## Apply cross-fitting: TRUE
## 
## ------------------ Fit summary       ------------------
##  Estimates and significance testing of the effect of target variables
##   Estimate. Std. Error t value Pr(>|t|)    
## D  0.169039   0.007133    23.7   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
dml_plr_trees = DoubleMLPLR$new(dml_data, ml_l=trees, ml_m=trees)
dml_plr_trees$fit()
print(dml_plr_trees)
## ================= DoubleMLPLR Object ==================
## 
## 
## ------------------ Data summary      ------------------
## Outcome variable: Y
## Treatment variable(s): D
## Covariates: numhh_1, numhh_2, birthyear, english, female
## Instrument(s): 
## No. Observations: 24646
## 
## ------------------ Score & algorithm ------------------
## Score function: partialling out
## DML algorithm: dml2
## 
## ------------------ Machine learner   ------------------
## ml_l: regr.rpart
## ml_m: regr.rpart
## 
## ------------------ Resampling        ------------------
## No. folds: 5
## No. repeated sample splits: 1
## Apply cross-fitting: TRUE
## 
## ------------------ Fit summary       ------------------
##  Estimates and significance testing of the effect of target variables
##   Estimate. Std. Error t value Pr(>|t|)    
## D  0.167527   0.007137   23.47   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
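The three PLR estimates (0.1712, 0.1690, 0.1675) are all close to the OLS estimate with controls (0.1701). They can be collected into a small comparison table, reusing the pattern applied to the PLIV models in Ex. 1.d:

# Collect PLR estimates and 95% confidence intervals across learners
confints_plr = rbind(dml_plr_forest$confint(), dml_plr_lasso$confint(),
                     dml_plr_trees$confint())
result_plr = data.table("model" = "PLR",
                        "ML" = c("ranger", "glmnet", "rpart"),
                        "Estimate" = c(dml_plr_forest$coef, dml_plr_lasso$coef,
                                       dml_plr_trees$coef),
                        "lower" = confints_plr[, 1],
                        "upper" = confints_plr[, 2])
result_plr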

Ex. 1.c

iv <- ivreg(data = data, Y ~ numhh_1 + numhh_2 + birthyear + english + female | D | Z)
summary(iv)
## 
## Call:
## ivreg(formula = Y ~ numhh_1 + numhh_2 + birthyear + english + 
##     female | D | Z, data = data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -0.4544 -0.3720 -0.2773  0.6152  0.9357 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  0.9462085  0.4839622   1.955  0.05058 .  
## D            0.0696273  0.0249371   2.792  0.00524 ** 
## numhh_1     -0.0941450  0.0076821 -12.255  < 2e-16 ***
## numhh_2     -0.1118885  0.0644765  -1.735  0.08269 .  
## birthyear   -0.0003925  0.0002460  -1.596  0.11060    
## english      0.1934575  0.0089655  21.578  < 2e-16 ***
## female       0.0085460  0.0063229   1.352  0.17652    
## 
## Diagnostic tests:
##                    df1   df2 statistic  p-value    
## Weak instruments     1 24639   2044.86  < 2e-16 ***
## Wu-Hausman           1 24638     17.75 2.52e-05 ***
## Sargan               0    NA        NA       NA    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.4636 on 24639 degrees of freedom
## Multiple R-Squared: 0.04712, Adjusted R-squared: 0.04689 
## Wald test: 138.5 on 6 and 24639 DF,  p-value: < 2.2e-16
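The Weak instruments diagnostic (F ≈ 2045) is the test of the excluded instrument in the first stage, which can also be run directly:

# First stage: lottery selection Z strongly predicts Medicaid enrollment D
first_stage <- lm(D ~ Z + numhh_1 + numhh_2 + birthyear + english + female, data = data)
summ(first_stage, digits = 4, robust = TRUE)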

Ex. 1.d

# Specify the data and variables for the causal model
dml_data_iv = DoubleMLData$new(data,
                               y_col = "Y",
                               d_cols = "D",
                               z_cols = "Z",
                               x_cols = c("numhh_1", "numhh_2", "birthyear",
                                          "english", "female"))
print(dml_data_iv)
## ================= DoubleMLData Object ==================
## 
## 
## ------------------ Data summary      ------------------
## Outcome variable: Y
## Treatment variable(s): D
## Covariates: numhh_1, numhh_2, birthyear, english, female
## Instrument(s): Z
## No. Observations: 24646
Learners to estimate the nuisance models. Versions 1 (random forest) and 2 (lasso) are defined exactly as in Ex. 1.b:

Version 3: Regression Trees

trees = lrn("regr.rpart")
trees
## <LearnerRegrRpart:regr.rpart>: Regression Tree
## * Model: -
## * Parameters: xval=0
## * Packages: mlr3, rpart
## * Predict Types:  [response]
## * Feature Types: logical, integer, numeric, factor, ordered
## * Properties: importance, missings, selected_features, weights

Version 4: XGBoost

boost = lrn("regr.xgboost", objective="reg:squarederror", eta=0.1, nrounds=35)
boost
## <LearnerRegrXgboost:regr.xgboost>: Extreme Gradient Boosting
## * Model: -
## * Parameters: eta=0.1, nrounds=35, nthread=1,
##   objective=reg:squarederror, verbose=0
## * Validate: NULL
## * Packages: mlr3, mlr3learners, xgboost
## * Predict Types:  [response]
## * Feature Types: logical, integer, numeric
## * Properties: hotstart_forward, importance, internal_tuning, missings,
##   validation, weights

DoubleML for the partially linear IV model (PLIV)

library(xgboost)
set.seed(10103)
dml_pliv_forest = DoubleMLPLIV$new(dml_data_iv, ml_l=forest, ml_m=forest, ml_r=forest)
dml_pliv_forest$fit()
print(dml_pliv_forest)
## ================= DoubleMLPLIV Object ==================
## 
## 
## ------------------ Data summary      ------------------
## Outcome variable: Y
## Treatment variable(s): D
## Covariates: numhh_1, numhh_2, birthyear, english, female
## Instrument(s): Z
## No. Observations: 24646
## 
## ------------------ Score & algorithm ------------------
## Score function: partialling out
## DML algorithm: dml2
## 
## ------------------ Machine learner   ------------------
## ml_l: regr.ranger
## ml_m: regr.ranger
## ml_r: regr.ranger
## 
## ------------------ Resampling        ------------------
## No. folds: 5
## No. repeated sample splits: 1
## Apply cross-fitting: TRUE
## 
## ------------------ Fit summary       ------------------
##  Estimates and significance testing of the effect of target variables
##   Estimate. Std. Error t value Pr(>|t|)   
## D   0.07300    0.02479   2.944  0.00324 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
dml_pliv_lasso = DoubleMLPLIV$new(dml_data_iv, ml_l=lasso, ml_m=lasso, ml_r=lasso)
dml_pliv_lasso$fit()
print(dml_pliv_lasso)
## ================= DoubleMLPLIV Object ==================
## 
## 
## ------------------ Data summary      ------------------
## Outcome variable: Y
## Treatment variable(s): D
## Covariates: numhh_1, numhh_2, birthyear, english, female
## Instrument(s): Z
## No. Observations: 24646
## 
## ------------------ Score & algorithm ------------------
## Score function: partialling out
## DML algorithm: dml2
## 
## ------------------ Machine learner   ------------------
## ml_l: regr.cv_glmnet
## ml_m: regr.cv_glmnet
## ml_r: regr.cv_glmnet
## 
## ------------------ Resampling        ------------------
## No. folds: 5
## No. repeated sample splits: 1
## Apply cross-fitting: TRUE
## 
## ------------------ Fit summary       ------------------
##  Estimates and significance testing of the effect of target variables
##   Estimate. Std. Error t value Pr(>|t|)  
## D   0.05984    0.02505   2.389   0.0169 *
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
dml_pliv_trees = DoubleMLPLIV$new(dml_data_iv, ml_l=trees, ml_m=trees, ml_r=trees)
dml_pliv_trees$fit()
print(dml_pliv_trees)
## ================= DoubleMLPLIV Object ==================
## 
## 
## ------------------ Data summary      ------------------
## Outcome variable: Y
## Treatment variable(s): D
## Covariates: numhh_1, numhh_2, birthyear, english, female
## Instrument(s): Z
## No. Observations: 24646
## 
## ------------------ Score & algorithm ------------------
## Score function: partialling out
## DML algorithm: dml2
## 
## ------------------ Machine learner   ------------------
## ml_l: regr.rpart
## ml_m: regr.rpart
## ml_r: regr.rpart
## 
## ------------------ Resampling        ------------------
## No. folds: 5
## No. repeated sample splits: 1
## Apply cross-fitting: TRUE
## 
## ------------------ Fit summary       ------------------
##  Estimates and significance testing of the effect of target variables
##   Estimate. Std. Error t value Pr(>|t|)   
## D   0.06589    0.02507   2.628  0.00859 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
dml_pliv_xgboost = DoubleMLPLIV$new(dml_data_iv, ml_l=boost, ml_m=boost, ml_r=boost)
dml_pliv_xgboost$fit()
print(dml_pliv_xgboost)
## ================= DoubleMLPLIV Object ==================
## 
## 
## ------------------ Data summary      ------------------
## Outcome variable: Y
## Treatment variable(s): D
## Covariates: numhh_1, numhh_2, birthyear, english, female
## Instrument(s): Z
## No. Observations: 24646
## 
## ------------------ Score & algorithm ------------------
## Score function: partialling out
## DML algorithm: dml2
## 
## ------------------ Machine learner   ------------------
## ml_l: regr.xgboost
## ml_m: regr.xgboost
## ml_r: regr.xgboost
## 
## ------------------ Resampling        ------------------
## No. folds: 5
## No. repeated sample splits: 1
## Apply cross-fitting: TRUE
## 
## ------------------ Fit summary       ------------------
##  Estimates and significance testing of the effect of target variables
##   Estimate. Std. Error t value Pr(>|t|)   
## D   0.06885    0.02486    2.77  0.00561 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Plot results
confints = rbind(confint(iv)["D", ],
                 dml_pliv_forest$confint(), dml_pliv_lasso$confint(),
                 dml_pliv_trees$confint(), dml_pliv_xgboost$confint())
estimates = c(coef(iv)["D"],
              dml_pliv_forest$coef, dml_pliv_lasso$coef,
              dml_pliv_trees$coef, dml_pliv_xgboost$coef)
result_pliv = data.table("model" = "PLIV",
                         "ML" = c("2SLS", "ranger", "glmnet", "rpart", "xgboost"),
                         "Estimate" = estimates,
                         "lower" = confints[,1],
                         "upper" = confints[,2])
result_pliv$ML = factor(result_pliv$ML, c("2SLS", "ranger", "glmnet", "rpart", "xgboost"))
result_pliv
##     model      ML   Estimate      lower     upper
##    <char>  <fctr>      <num>      <num>     <num>
## 1:   PLIV    2SLS 0.06962734 0.02074912 0.1185056
## 2:   PLIV  ranger 0.07300334 0.02440888 0.1215978
## 3:   PLIV  glmnet 0.05983663 0.01074140 0.1089319
## 4:   PLIV   rpart 0.06589218 0.01674871 0.1150357
## 5:   PLIV xgboost 0.06884929 0.02012969 0.1175689
g_ci = ggplot(result_pliv, aes(x = ML, y = Estimate, color = ML)) +
  geom_point() +
  geom_errorbar(aes(ymin = lower, ymax = upper, color = ML))  +
  geom_hline(yintercept = 0, color = "grey") +
  theme_minimal() + ylab("Coefficient and 95% confidence interval") +
  xlab("") +
  ggtitle("PLIV") +
  theme(axis.text.x = element_text(angle = 90), legend.position = "none",
        text = element_text(size = 20))

g_ci

Exercise 2

We want to explore the identification of Local Average Treatment Effects (LATEs) and test for violations of instrumental variable assumptions using the LATEtest package in R. The package uses causal forests to search for and test local violations in a data-driven way. Your task is to conduct a LATE analysis and investigate potential violations of the instrument validity assumption.

LATEtest

library(devtools)
## Loading required package: usethis
#install_github("farbmacher/LATEtest")
library(LATEtest)
library(rpart.plot)
## Loading required package: rpart
df <- data
Xvars <- c('numhh_1','numhh_2','birthyear','english','female') # covariates
df$Y <- as.vector(df[['Y']]) # outcome
df$D <- as.vector(df[['D']]) # treatment
df$Z <- as.vector(df[['Z']]) # instrument
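For intuition on what is being tested: instrument validity implies Kitagawa-type inequalities such as P(Y = y, D = 1 | Z = 1, X) >= P(Y = y, D = 1 | Z = 0, X), plus the analogue for D = 0 with the roles of Z reversed, for every y; LATEtest uses causal forests to search covariate subsets for local violations of these moment inequalities. (The Q1A/Q1B/Q0A/Q0B trees in the output below appear to index the four inequalities; this mapping is our reading of the output, not taken from the package documentation.) A rough unconditional check of one inequality at y = 1, a sketch rather than the formal test:

# P(Y = 1, D = 1 | Z = 1) should be >= P(Y = 1, D = 1 | Z = 0) under validity
p1 <- with(df, mean(Y == 1 & D == 1 & Z == 1) / mean(Z == 1))
p0 <- with(df, mean(Y == 1 & D == 1 & Z == 0) / mean(Z == 0))
p1 - p0  # non-negative if this implication holds unconditionally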

Exercise 2.a

test <- LATEtest(data=df, covars=Xvars, alpha=0.05, minsize=100)
maxtree <- eval(parse(text=paste("test$treelist$tree_",test$maxTtree$label,test$maxTtree$J,sep="")))
rpart.plot(maxtree,extra = 101,box.palette="GyRd",shadow.col="gray",nn=TRUE,roundint=FALSE)
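Instead of the eval(parse(...)) construction, the fitted tree can be pulled from the named list directly (treelist prints as a named list with components tree_Q1A1, tree_Q1B1, ...); an equivalent one-liner:

# Same extraction without eval(parse(...))
maxtree <- test$treelist[[paste0("tree_", test$maxTtree$label, test$maxTtree$J)]]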

test
## $treelist
## $treelist$tree_Q1A1
## n= 12323 
## 
## node), split, n, deviance, yval
##       * denotes terminal node
## 
## 1) root 12323 7299.4720 -0.13721280  
##   2) birthyear< 1951.5 1186  971.9999 -0.25391750 *
##   3) birthyear>=1951.5 11137 6309.5980 -0.12478460  
##     6) birthyear< 1973.5 6330 3527.4810 -0.14633180 *
##     7) birthyear>=1973.5 4807 2775.3080 -0.09641076 *
## 
## $treelist$tree_Q1B1
## n= 12323 
## 
## node), split, n, deviance, yval
##       * denotes terminal node
## 
## 1) root 12323 7876.577 -0.1515664 *
## 
## $treelist$tree_Q1A2
## n= 12323 
## 
## node), split, n, deviance, yval
##       * denotes terminal node
## 
## 1) root 12323 6690.1160 -0.11671090  
##   2) english>=0.5 10666 6207.5490 -0.12753170 *
##   3) english< 0.5 1657  473.2793 -0.04705769 *
## 
## $treelist$tree_Q1B2
## n= 12323 
## 
## node), split, n, deviance, yval
##       * denotes terminal node
## 
## 1) root 12323 6665.0900 -0.09880377  
##   2) english>=0.5 10645 6266.6670 -0.10964220  
##     4) birthyear>=1952.5 9408 5649.9570 -0.11849910  
##       8) birthyear< 1968.5 4045 2707.5450 -0.15135740 *
##       9) birthyear>=1968.5 5363 2934.7510 -0.09371596 *
##     5) birthyear< 1952.5 1237  610.3591 -0.04228154 *
##   3) english< 0.5 1678  389.2395 -0.03004614 *
## 
## $treelist$tree_Q0A1
## n= 12323 
## 
## node), split, n, deviance, yval
##       * denotes terminal node
## 
## 1) root 12323 13007.150 -0.1605121  
##   2) birthyear< 1973.5 7516  7856.622 -0.1905979 *
##   3) birthyear>=1973.5 4807  5133.088 -0.1134713 *
## 
## $treelist$tree_Q0B1
## n= 12323 
## 
## node), split, n, deviance, yval
##       * denotes terminal node
## 
## 1) root 12323 13107.190 -0.1537689  
##   2) birthyear< 1975.5 8096  8517.112 -0.1723714 *
##   3) birthyear>=1975.5 4227  4581.911 -0.1181395 *
## 
## $treelist$tree_Q0A2
## n= 12323 
## 
## node), split, n, deviance, yval
##       * denotes terminal node
## 
## 1) root 12323 8706.669 -0.08909228  
##   2) numhh_1< 0.5 9831 7304.932 -0.10250490 *
##   3) numhh_1>=0.5 2492 1392.991 -0.03617909 *
## 
## $treelist$tree_Q0B2
## n= 12323 
## 
## node), split, n, deviance, yval
##       * denotes terminal node
## 
## 1) root 12323 8669.2250 -0.08842468  
##   2) english>=0.5 10645 8072.6930 -0.09835373 *
##   3) english< 0.5 1678  588.8252 -0.02543615 *
## 
## 
## $maxTtree
## $maxTtree$label
## [1] "Q0B"
## 
## $maxTtree$J
## [1] 2
## 
## 
## $descriptives
## $descriptives$meanY_D_Z
##      Mean      Mean      Mean 
## 0.3435040 0.2451513 0.3905705 
## 
## $descriptives$min_mean_maxZhatA
##      Min.      Mean      Max. 
## 0.2012062 0.3900155 0.7467511 
## 
## $descriptives$min_mean_maxZhatB
##      Min.      Mean      Max. 
## 0.2099685 0.3906398 0.7218264 
## 
## 
## $parameters
## $parameters$covars
## [1] "numhh_1"   "numhh_2"   "birthyear" "english"   "female"   
## 
## $parameters$slice
## [1] "equidistant"
## 
## $parameters$subsets
## [1] 4
## 
## $parameters$minsize
## [1] 100
## 
## $parameters$alpha
## [1] 0.05
## 
## $parameters$huge
## [1] FALSE
## 
## 
## $leafinfo
##    label J knot n_train Tstat_train is_relevant is_largest sep n_est  Tstat_est
## 1    Q1A 1    2    1186   -9.651124       FALSE       TRUE  |   1187  -7.174851
## 2    Q1A 1    6    6330  -15.593419       FALSE      FALSE  |   6343 -16.495115
## 3    Q1A 1    7    4807   -8.795361       FALSE       TRUE  |   4793 -11.267660
## 4    Q1B 1    1   12323  -21.043369       FALSE       TRUE  |  12323 -19.789279
## 5    Q1A 2    2   10666  -17.263112       FALSE      FALSE  |  10645 -14.742271
## 6    Q1A 2    3    1657   -3.582055       FALSE       TRUE  |   1678  -2.553955
## 7    Q1B 2    3    1678   -2.553955       FALSE       TRUE  |   1657  -3.582055
## 8    Q1B 2    5    1237   -2.115322        TRUE       TRUE  |   1253  -5.254173
## 9    Q1B 2    8    4045  -11.763237       FALSE      FALSE  |   4083 -12.312726
## 10   Q1B 2    9    5363   -9.275872       FALSE       TRUE  |   5330 -11.032464
## 11   Q0A 1    2    7516  -16.159545       FALSE      FALSE  |   7530 -14.163542
## 12   Q0A 1    3    4807   -7.611681       FALSE       TRUE  |   4793  -8.802597
## 13   Q0B 1    2    8096  -15.119444       FALSE      FALSE  |   8137 -16.121321
## 14   Q0B 1    3    4227   -7.375668       FALSE       TRUE  |   4186  -7.322763
## 15   Q0A 2    2    9831  -11.789358       FALSE      FALSE  |   9864 -11.218187
## 16   Q0A 2    3    2492   -2.414666       FALSE       TRUE  |   2459  -3.527070
## 17   Q0B 2    2   10645  -11.651624       FALSE      FALSE  |  10666 -11.520276
## 18   Q0B 2    3    1678   -1.757888        TRUE       TRUE  |   1657  -2.482540
##    Tmax
## 1     0
## 2     0
## 3     0
## 4     0
## 5     0
## 6     0
## 7     0
## 8     0
## 9     0
## 10    0
## 11    0
## 12    0
## 13    0
## 14    0
## 15    0
## 16    0
## 17    0
## 18    1
## 
## $results
## $results$nu_ineq
##      p0 p1 p2
## [1,] 18  2  2
## 
## $results$Tmax
##            T0       T1       T2
## [1,] -2.48254 -2.48254 -2.48254
## 
## $results$cv
##           cv0      cv1      cv2
## [1,] 2.772921 1.959964 1.959964
## 
## $results$reject_Bonf
##       [,1]  [,2]  [,3]
## [1,] FALSE FALSE FALSE
## 
## $results$pvalue_Bonf
##      [,1] [,2] [,3]
## [1,]    1    1    1
## 
## $results$pvalue_Holm
##      [,1] [,2] [,3]
## [1,]    1    1    1
## 
## $results$pvalue_BenjHochberg
##      [,1]      [,2]      [,3]
## [1,]    1 0.9999999 0.9999999
## 
## $results$pvalue_BenjYekutieli
##      [,1] [,2] [,3]
## [1,]    1    1    1

Exercise 2.b

# Artificial (local) violation of the LATE assumptions
subset <- df$birthyear == 1979
effect_size <- 1*sd(df$Y)
df$Y <- df$Y + effect_size*subset*df$Z
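A quick look at the size of the injected violation (a sketch):

# Contaminated subgroup and magnitude of the shift
sum(subset)              # observations born in 1979
sum(subset & df$Z == 1)  # of those, outcomes actually shifted (Z = 1)
effect_size              # shift equals one standard deviation of the original Y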

test <- LATEtest(data=df, covars=Xvars, huge=TRUE, tree_fraction=0.05, alpha=0.05, minsize=800)

maxtree <- eval(parse(text=paste("test$treelist$tree_",test$maxTtree$label,test$maxTtree$J,sep="")))
rpart.plot(maxtree,extra = 101,box.palette="GyRd",shadow.col="gray",nn=TRUE,roundint=FALSE)

test$results
## $nu_ineq
##      p0 p1 p2
## [1,] 51 12 11
## 
## $Tmax
##            T0       T1       T2
## [1,] 8.047937 8.047937 8.047937
## 
## $cv
##           cv0      cv1      cv2
## [1,] 3.096109 2.638257 2.608616
## 
## $reject_Bonf
##      [,1] [,2] [,3]
## [1,] TRUE TRUE TRUE
## 
## $pvalue_Bonf
##              [,1]         [,2]         [,3]
## [1,] 2.264855e-14 5.329071e-15 4.884981e-15
## 
## $pvalue_Holm
##              [,1]         [,2]         [,3]
## [1,] 2.264855e-14 5.329071e-15 4.884981e-15
## 
## $pvalue_BenjHochberg
##              [,1]         [,2]         [,3]
## [1,] 2.264855e-14 5.329071e-15 4.884981e-15
## 
## $pvalue_BenjYekutieli
##              [,1]         [,2]         [,3]
## [1,] 1.023446e-13 1.653723e-14 1.475204e-14