library( gbm )  
## Loading required package: survival
## Loading required package: lattice
## Loading required package: splines
## Loading required package: parallel
library( MASS )    #    used to generate correlated variables
library( dplyr )
## 
## Attaching package: 'dplyr'
## 
## The following object is masked from 'package:MASS':
## 
##     select
## 
## The following object is masked from 'package:stats':
## 
##     filter
## 
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library( Hmisc )    #   used for graphing se bars
## Loading required package: grid
## Loading required package: Formula
## Loading required package: ggplot2
## 
## Attaching package: 'Hmisc'
## 
## The following objects are masked from 'package:dplyr':
## 
##     combine, src, summarize
## 
## The following objects are masked from 'package:base':
## 
##     format.pval, round.POSIXt, trunc.POSIXt, units
library( grid )

#source("~/perts_analyses/common/util.R")


##  HELPER FUNCTIONS

cor_vars <- function( n=100 , r=.5, mus=c(0,0) ){
  #  create the covariance matrix
  sigma   <- matrix( rep(c(1,r,r,1)), 2,2)  
  #   generate the variables
  mat     <- mvrnorm( n=n, mu=mus, sigma, empirical=T)
  return( list(mat[,1], mat[,2]) )
}


# Define Dataset Parameters
n      <- 2000 # size of sample (half will be treated)
es     <- 2    # effect size (Cohen's d) on influeced group
cov_r  <- 0    # correlation of y with covariate

# Create the moderator variable
moderator <- sample(
  x=c("Woman","Man"),
  size=n, 
  replace=TRUE
)

# simluate y and a correlated covariate
cor_list <- cor_vars( n=n, r=cov_r)
y     <- cor_list[[1]]
cov   <- cor_list[[2]]
treat <- rep(c(0,1),n/2)

d <- data.frame(
  y=y,
  cov=cov,
  moderator=moderator,
  treat=treat
)

# add moderated treatment effect (boost treated women by es)
influenced <- d$treat == 1 & d$moderator %in% c("Woman")
d$y[influenced] <- d$y[influenced] + es

aggregate( y ~ treat + moderator, data=d, mean)
##   treat moderator           y
## 1     0       Man -0.01450592
## 2     1       Man -0.01094801
## 3     0     Woman  0.01895792
## 4     1     Woman  2.00575981
#  set model parameters
CV_FOLDS <- 5
GBM_NTREES <- 5000
gbm_formula <- "y ~ treat + moderator + cov" %>% 
  as.formula()

model <- gbm(   gbm_formula ,
                ,data = d
                ,distribution = "gaussian"
                #,distribution = list(name="quantile",alpha=0.5)
                ,n.trees = GBM_NTREES
                ,shrinkage = 0.01
                ,interaction.depth = 2
                ,n.minobsinnode = 50
                ,verbose = FALSE
                ,cv.folds = CV_FOLDS
)




#   shows error in training and cv sets
best.iter <- gbm.perf(model, method="cv")

# Overall summary
summary(model, n.trees=best.iter)

##                 var   rel.inf
## treat         treat 50.187844
## moderator moderator 48.139219
## cov             cov  1.672937
# interactions
gbm:::interact.gbm(model, data=d, i.var=c("treat", "moderator"), best.iter)
## [1] 0.5498301
gbm:::interact.gbm(model, data=d, i.var=c("treat", "cov"), best.iter)
## [1] 0.0101033
gbm:::interact.gbm(model, data=d, i.var=c("moderator", "cov"), best.iter)
## [1] 0.006432768