library( gbm )
## Loading required package: survival
## Loading required package: lattice
## Loading required package: splines
## Loading required package: parallel
library( MASS ) # used to generate correlated variables
library( dplyr )
##
## Attaching package: 'dplyr'
##
## The following object is masked from 'package:MASS':
##
## select
##
## The following object is masked from 'package:stats':
##
## filter
##
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library( Hmisc ) # used for graphing se bars
## Loading required package: grid
## Loading required package: Formula
## Loading required package: ggplot2
##
## Attaching package: 'Hmisc'
##
## The following objects are masked from 'package:dplyr':
##
## combine, src, summarize
##
## The following objects are masked from 'package:base':
##
## format.pval, round.POSIXt, trunc.POSIXt, units
library( grid )
#source("~/perts_analyses/common/util.R")
## HELPER FUNCTIONS
cor_vars <- function( n=100 , r=.5, mus=c(0,0) ){
# create the covariance matrix
sigma <- matrix( rep(c(1,r,r,1)), 2,2)
# generate the variables
mat <- mvrnorm( n=n, mu=mus, sigma, empirical=T)
return( list(mat[,1], mat[,2]) )
}
# Define Dataset Parameters
n <- 2000 # size of sample (half will be treated)
es <- 2 # effect size (Cohen's d) on influeced group
cov_r <- 0 # correlation of y with covariate
# Create the moderator variable
moderator <- sample(
x=c("Woman","Man"),
size=n,
replace=TRUE
)
# simluate y and a correlated covariate
cor_list <- cor_vars( n=n, r=cov_r)
y <- cor_list[[1]]
cov <- cor_list[[2]]
treat <- rep(c(0,1),n/2)
d <- data.frame(
y=y,
cov=cov,
moderator=moderator,
treat=treat
)
# add moderated treatment effect (boost treated women by es)
influenced <- d$treat == 1 & d$moderator %in% c("Woman")
d$y[influenced] <- d$y[influenced] + es
aggregate( y ~ treat + moderator, data=d, mean)
## treat moderator y
## 1 0 Man 0.023751453
## 2 1 Man 0.022304150
## 3 0 Woman -0.006884881
## 4 1 Woman 1.962107398
Here I will explicitly add the cross between moderator and treatment (instead of using a formula to do so).
d$cross = factor(paste(d$moderator, d$treat))
# set model parameters
CV_FOLDS <- 5
GBM_NTREES <- 5000
gbm_formula <- "y ~ treat + moderator + cov + cross" %>%
as.formula()
model <- gbm( gbm_formula ,
,data = d
,distribution = "gaussian"
#,distribution = list(name="quantile",alpha=0.5)
,n.trees = GBM_NTREES
,shrinkage = 0.01
,interaction.depth = 1
,n.minobsinnode = 50
,verbose = FALSE
,cv.folds = CV_FOLDS
)
# shows error in training and cv sets
best.iter <- gbm.perf(model, method="cv")
d$yhat <- predict( model, n.trees=best.iter, d )
# what is the relative influence of each variable?
summary(model,n.trees=best.iter)
## var rel.inf
## cross cross 97.21786144
## cov cov 2.59978512
## treat treat 0.12198445
## moderator moderator 0.06036899