# load data
setwd('../dashboard/')
source('global.R')
## [1] "../data/infant_directed_speech_preference.csv"
## [1] "../data/label_advantage.csv"
## [1] "../data/mutual_exclusivity.csv"
## [1] "../data/phonemic_discrimination_native.csv"
## [1] "../data/phonemic_discrimination_nonnative.csv"
## [1] "../data/transitive_alternation.csv"
## [1] "../data/word_segmentation.csv"
We currently have six datasets included: infant-directed speech (IDS) preference, label advantage, mutual exclusivity, native phonemic discrimination, non-native phonemic discrimination, and word segmentation. (A seventh file, transitive alternation, is loaded above but does not enter the analyses below.)
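For orientation, here is a quick tally sketch of how many effect sizes and papers each dataset contributes (n_effects and n_papers are ad hoc names):
# count effect sizes and distinct papers per dataset
all_data %>%
  group_by(dataset) %>%
  summarise(n_effects = n(),
            n_papers = n_distinct(unique_ID))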
all = all_data %>%
mutate(n_total = ifelse(!is.na(n_2), n_1 + n_2, n_1)) %>%
select(dataset, d_calc,
mean_age_1, n_total,
n_excluded, num_trials, method)
ggplot(all, aes(x = mean_age_1, y = n_total, color = dataset)) +
geom_point() +
geom_smooth(method = "lm") +
facet_wrap(~dataset, scales = "free") +
theme_bw() +
theme(legend.position="none")
ns_ds = all_data %>%
mutate(n_total = ifelse(!is.na(n_2), n_1 + n_2, n_1)) %>%
select(dataset, d_calc,
mean_age_1, n_total)
ggplot(ns_ds, aes(y = n_total, x = d_calc, color = dataset)) +
geom_point() +
geom_smooth(method = "lm") +
facet_wrap(~dataset, scales = "free") +
theme_bw() +
xlab("Effect size") +
theme(legend.position="none")
Some potential issues with these data, starting with exclusion rates:
exclusions = all_data %>%
filter(!is.na(n_excluded_1) | !is.na(n_excluded)) %>%
mutate(n_excluded = ifelse(is.na(n_excluded), n_excluded_1,
n_excluded),
n_total = ifelse(!is.na(n_2), n_1 + n_2, n_1)) %>%
select(dataset, d_calc,
mean_age_1, n_total,
n_excluded, num_trials, method) %>%
mutate(prop_excluded = n_excluded/n_total) %>%
filter(prop_excluded < 1)
ggplot(exclusions, aes(x = prop_excluded, y = d_calc,
                       color = dataset)) +
  geom_point() +
  geom_smooth(method = "lm") +
  ggtitle("effect size as a function of prop. excluded") +
  xlab("proportion excluded") +
  ylab("d") +
  theme_bw()
Effect size looks like it may increase with the proportion of participants excluded.
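As a rough pooled check of the trend that geom_smooth draws panel by panel (unweighted and collapsing across datasets, so descriptive only):
# unweighted pooled linear trend of effect size on exclusion rate
summary(lm(d_calc ~ prop_excluded, data = exclusions))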
ggplot(exclusions, aes(x = mean_age_1, y = prop_excluded,
color = dataset)) +
geom_point() +
geom_smooth(method = "lm") +
ggtitle("prop. excluded as a function of age") +
xlab("mean age (months)") +
ylab("proportion excluded") +
theme_bw()
# extract publication year: the first run of digits in unique_ID
years.df = all_data %>%
  mutate(year = as.numeric(sapply(strsplit(unique_ID, "[^0-9]+"),
                                  function(x) x[2])))
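A sanity check of the year extraction, using a hypothetical ID of the form author-plus-year:
# splitting on runs of non-digits yields c("", "2010"), so element 2 is the year
strsplit("Smith2010a", "[^0-9]+")[[1]]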
#years.sig = rep('', length(unique(years.df$dataset)))
#for (i in 1:length(unique(years.df$dataset))) {
# my_data = all_data[all_data$dataset == datasets[i],]
# m = rma(d ~ year + mean_age_1, vi = d_var, slab = as.character(unique_ID), data = #my_data, method = "REML")
# years.sig[i] = ifelse(m$pval[2] <.05, "*", years.sig)
#}
#ann_text <- data.frame(sig = years.sig, lab = "Text",
# df = unique(years.df$dataset))
ggplot(years.df, aes(x = year, y = d_calc, color = dataset)) +
geom_point() +
#scale_colour_gradientn(colours = topo.colors(10)) +
geom_smooth(method = "lm") +
facet_wrap(~ dataset, scales = "free_y") +
theme_bw() +
#geom_text(data = ann_text,aes(label = sig), x = 2000, y = 1) +
xlab("published year") +
theme(legend.position="none")
rma(d_calc ~ year + mean_age_1 + dataset, vi = d_var_calc, data = years.df, method = "REML")
##
## Mixed-Effects Model (k = 642; tau^2 estimator: REML)
##
## tau^2 (estimated amount of residual heterogeneity): 0.1920 (SE = 0.0146)
## tau (square root of estimated tau^2 value): 0.4382
## I^2 (residual heterogeneity / unaccounted variability): 80.99%
## H^2 (unaccounted variability / sampling variability): 5.26
## R^2 (amount of heterogeneity accounted for): 18.91%
##
## Test for Residual Heterogeneity:
## QE(df = 634) = 2655.4436, p-val < .0001
##
## Test of Moderators (coefficient(s) 2,3,4,5,6,7,8):
## QM(df = 7) = 141.3138, p-val < .0001
##
## Model Results:
##
## estimate se zval
## intrcpt 23.6517 6.9990 3.3793
## year -0.0116 0.0035 -3.2979
## mean_age_1 0.0010 0.0002 6.2815
## datasetLabel advantage in concept learning -0.3898 0.1046 -3.7248
## datasetMutual exclusivity -0.4635 0.1490 -3.1101
## datasetPhonemic discrimination (native) -0.0855 0.1038 -0.8239
## datasetPhonemic discrimination (non-native) -0.0918 0.1206 -0.7612
## datasetWord segmentation -0.4844 0.0954 -5.0776
## pval ci.lb ci.ub
## intrcpt 0.0007 9.9339 37.3696 ***
## year 0.0010 -0.0184 -0.0047 ***
## mean_age_1 <.0001 0.0007 0.0013 ***
## datasetLabel advantage in concept learning 0.0002 -0.5949 -0.1847 ***
## datasetMutual exclusivity 0.0019 -0.7556 -0.1714 **
## datasetPhonemic discrimination (native) 0.4100 -0.2889 0.1179
## datasetPhonemic discrimination (non-native) 0.4466 -0.3281 0.1445
## datasetWord segmentation <.0001 -0.6714 -0.2974 ***
##
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
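To read the year coefficient in effect-size units, here is a sketch that refits and saves the model (m_year is an ad hoc name):
# change in d implied per decade of publication, holding age and dataset fixed
m_year = rma(d_calc ~ year + mean_age_1 + dataset, vi = d_var_calc,
             data = years.df, method = "REML")
round(coef(m_year)["year"] * 10, 3)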
Mean age may itself covary with publication year, which would complicate interpreting the year effect, so we plot the two against each other:
ggplot(years.df, aes(x = year, y = mean_age_1, color = dataset)) +
geom_point() +
geom_smooth(method = "lm") +
facet_wrap(~ dataset, scales = "free_y") +
theme_bw() +
#geom_text(data = ann_text,aes(label = sig), x = 2000, y = 1) +
xlab("published year") +
theme(legend.position="none")
Here we’re interested in describing the extent to which papers report all the statistics that are desirable from the perspective of meta-analysis (i.e., t/F, M, SD, d).
counts = all_data %>%
summarise(test_statistic = sum(!is.na(t) | !is.na(F) | !is.na(r)),
means = sum(!is.na(x_1)),
SD = sum(!is.na(SD_1)),
d = sum(!is.na(d_calc)),
g = sum(!is.na(g_calc)),
r = sum(!is.na(r_calc)),
age_range = sum(!is.na(age_range_1)),
gender = sum(!is.na(gender_1))) %>%
gather("coded_variable", "n") %>%
mutate(coded = "coded") %>%
mutate(total = nrow(all_data)) %>%
mutate(coded_variable = factor(coded_variable, levels = c("d", "g", "r", "means",
"SD", "test_statistic",
"age_range", "gender")))
# add mirror-image "uncoded" rows so each bar stacks to the dataset total
counts = counts %>%
  mutate(n = total - n,
         coded = "uncoded") %>%
  bind_rows(counts) %>%
  mutate(n_lab = ifelse(coded == "coded", n, "")) %>%
  arrange(coded)
ggplot(counts) +
geom_bar(aes(x = coded_variable,
y = n/total,
fill = coded,
order = coded),
stat = "identity") +
ylim(0,1) +
ylab("Number coded") +
xlab("Coded variable") +
ggtitle("All data") +
annotate("text", x = 1, y = .9,
label = paste("N =", counts$total[1]), size = 6) +
scale_fill_manual(values=c( "lightgreen", "grey")) +
geom_text(aes(label = n_lab, x = coded_variable, y = n/total -.06) )+
theme_bw() +
theme(panel.border = element_blank(),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
axis.line = element_line(colour = "black"),
text = element_text(size=20))
Broken down by meta-analysis:
counts = all_data %>%
group_by(dataset) %>%
summarise(test_statistic = sum(!is.na(t) | !is.na(F)),
means = sum(!is.na(x_1)),
SD = sum(!is.na(SD_1)),
d = sum(!is.na(d_calc)),
g = sum(!is.na(g_calc)),
r = sum(!is.na(r_calc)),
age_range = sum(!is.na(age_range_1)),
gender = sum(!is.na(gender_1)),
total = n()) %>%
gather(coded_variable, n, -dataset, -total) %>%
mutate(coded = "coded") %>%
mutate(coded_variable = factor(coded_variable, levels = c("d", "g", "r", "means",
"SD", "test_statistic",
"age_range", "gender")))
counts = counts %>%
mutate(n = total - n,
coded = "uncoded") %>%
bind_rows(counts) %>%
mutate(n_lab = ifelse(coded == "coded", n, "")) %>%
arrange(coded)
ggplot(counts, aes(fill = coded)) +
  geom_bar(aes(x = factor(coded_variable),
               y = n/total), # coded + uncoded already sum to 1, so position = "fill" is a no-op
           stat = "identity",
           position = "fill") +
facet_wrap(~dataset, nrow=3, ncol=2) +
ylim(0,1) +
ylab("Number coded") +
xlab("Coded variable") +
scale_fill_manual(values = c("lightgreen", "grey")) +
geom_text(aes(label = n_lab,
x = coded_variable,
y = n/total -.06)) +
theme_bw() +
theme(panel.border = element_blank(),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
axis.line = element_line(colour = "black"),
text = element_text(size=20),
strip.background = element_blank(),
axis.text.x = element_text(angle = 30, hjust = 1))
#annotate("text", x = 1, y = .9,
# label = paste("N =", totals), size = 6) +
#totals_text = all_data %>%
# group_by(dataset) %>%
# summarise(total = n())%>%
# mutate(coded_variable = 1,
# lab = paste("N =", total))
Here I calculate the power to detect an effect under a random-effects model (Hedges & Pigott, 2001; Valentine, Pigott, & Rothstein, 2010): following Valentine et al.’s Eq. 6, lambda = estimate/SE, and power = 1 - pnorm(c_crit - lambda), as in the function below. Note that this information is redundant with the confidence intervals on the estimated effect sizes, and the power in all of these MAs is essentially 1. Below, I run a simulation sampling n effect sizes from each meta-analysis and calculating power. We need surprisingly few studies to get relatively high power.
calculate_ma_power <- function(sample_data) {
ALPHA <- .05
NTAILS <- 2
c_crit <- qnorm(ALPHA/NTAILS, lower.tail = F)
model <- rma(d_calc, vi = d_var_calc, data = sample_data,
method = "REML", control = list(maxiter = 1000, stepadj = 0.5))
lambda <- model$b[1] / model$se[1] # Valentine, Eq. [6]
power <- 1 - pnorm(c_crit - lambda)
power
}
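For instance, a sketch of the power for a single meta-analysis, using one of the dataset labels from the model output above:
# power to detect the word segmentation meta-analytic effect
calculate_ma_power(filter(all_data, dataset == "Word segmentation"))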
# returns a one-argument function (bootstrap index, ignored) that applies
# summary_function to a random sample of n effect sizes
one_sample <- function(ma_data, summary_function, n) {
  function(k) {
    do.call(summary_function, list(sample_n(ma_data, size = n, replace = F)))
  }
}
all_samples <- function(ma_data, summary_function, n, nboot) {
sample_values <- 1:nboot %>%
map(one_sample(ma_data, summary_function, n)) %>%
unlist()
data.frame(n = n,
mean = mean(sample_values),
ci_lower = ci_lower(sample_values),
ci_upper = ci_upper(sample_values),
row.names = NULL)
}
dataset_data <- function(ma_data, summary_function, nboot) {
overall <- do.call(summary_function, list(ma_data))
seq(1, nrow(ma_data), by = 1) %>%
map(function(n) all_samples(ma_data,
summary_function, n, nboot)) %>%
bind_rows() %>%
mutate(dataset = unique(ma_data$dataset),
overall_est = overall)
}
power_data <- all_data %>%
split(.$dataset) %>%
map(function(ma_data) dataset_data(ma_data,
calculate_ma_power, 50)) %>%
bind_rows()
ggplot(power_data, aes(x = n, y = mean, fill = dataset)) +
geom_ribbon(aes(ymin = ci_lower, ymax = ci_upper), alpha = 0.5) +
geom_hline(aes(yintercept = overall_est), linetype = "dashed") +
geom_line( aes(x = n, y = mean)) +
facet_wrap(~dataset) +
xlab("Number of effect sizes") +
ylab("Meta-analytic power") +
xlim(0,150) +
scale_fill_solarized(guide = FALSE) +
theme_bw() +
theme(text = element_text(family = "Open Sans"),
panel.grid.minor = element_blank())
Not totally sure how to do this; Valentine, Pigott, and Rothstein (2010) suggest it is difficult without the raw data.
P-curve corrects for selective reporting on the basis of significance (non-significant results are under-reported; Simonsohn, Nelson, & Simmons, 2014).
# calculate degrees of freedom based on Ns and design (within: n_1 - 1; between: n_1 + n_2 - 2)
pcurve.data = all_data %>%
  select(dataset, unique_ID, t, F, n_1, n_2, participant_design) %>%
  filter(!is.na(t) | !is.na(F)) %>%
  mutate(df = ifelse(participant_design == "between", n_1 + n_2 - 2, n_1 - 1))
#summary(pcurve.data$df)
#pcurve.df %>%
# filter(participant_design == "between") %>%
# data.frame()
# code from Simonsohn, Nelson, & Simmons (2014) appendix
loss = function(t_obs, df_obs, d_est) {
  t_obs = abs(t_obs)
  p_obs = 2*(1 - pt(t_obs, df = df_obs))  # two-tailed p for each test
  t.sig = subset(t_obs, p_obs < .05)      # p-curve uses significant results only
  df.sig = subset(df_obs, p_obs < .05)
  ncp_est = sqrt((df.sig + 2)/4)*d_est    # noncentrality implied by candidate d
  tc = qt(.975, df.sig)                   # critical t at alpha = .05
  power_est = 1 - pt(tc, df.sig, ncp_est) # power under candidate d
  p_larger = pt(t.sig, df = df.sig, ncp = ncp_est)
  ppr = (p_larger - (1 - power_est))/power_est  # pp-values: p-values conditional on significance
  KSD = ks.test(ppr, punif)$statistic     # distance from uniform; smallest at best-fitting d
  return(KSD)
}
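A toy call just to exercise the function (hypothetical t-values and dfs, all significant; d_est = .4 is arbitrary):
loss(t_obs = c(2.8, 3.1, 2.4), df_obs = c(44, 75, 125), d_est = .4)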
plotloss=function(t_obs,df_obs,dmin,dmax) {
loss.all=c()
di=c()
for (i in 0:((dmax-dmin)*100)) {
d = dmin + i/100
di = c(di,d)
options(warn = -1)
loss.all=c(loss.all,loss(df_obs=df_obs,t_obs=t_obs,d_est=d))
options(warn = 0)
}
imin=match(min(loss.all),loss.all)
dstart=dmin+imin/100
dhat=optimize(loss,c(dstart-.1, dstart+.1), df_obs=df_obs,t_obs=t_obs)
plot(di,
loss.all, xlab="Effect size\nCohen-d",
ylab="Loss (D stat in KS test)",
ylim=c(0,1),
main="How well does each effect size fit? (lower is better)")
points(dhat$minimum,dhat$objective,pch=19,col="red",cex=2)
text(dhat$minimum, dhat$objective - .08,
     paste0("p-curve's estimate of effect size:\nd=", round(dhat$minimum, 3)), col = "red")
return(dhat$minimum)
}
#Example
#t_obs= c(1.7, 2.8, -3.1, 2.4) # include one p > .05 and one negative t-value to highlight how we treat those
#df_obs=c(44, 75, 125, 200)
#plotloss(t_obs=t_obs,df_obs=df_obs,dmin=-1,dmax=1)
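A sketch of running this on our coded statistics, restricted to rows with reported t-values (F-based rows would first need converting, e.g. t = sqrt(F) for one-numerator-df tests); the search range for d is arbitrary:
# p-curve effect size estimate from the t-value subset (assumes some significant results)
pc = filter(pcurve.data, !is.na(t), !is.na(df))
plotloss(t_obs = pc$t, df_obs = pc$df, dmin = -1, dmax = 2)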
Trim and fill corrects for selective reporting on the basis of effect size (small effects are under-reported; Duval & Tweedie, 2000a, 2000b) by estimating how many studies are missing from one side of the funnel plot and imputing them before re-estimating the pooled effect.
### fit a random-effects model per dataset and apply trim-and-fill
datasets = unique(all_data$dataset)
for (i in 1:length(datasets)) {
  my_data = all_data[all_data$dataset == datasets[i],]
  res <- rma(d_calc, d_var_calc, data = my_data)
  taf <- trimfill(res)
  funnel(taf, main = my_data$dataset[1])
}
res <- rma(d_calc, d_var_calc, data = all_data)
taf <- trimfill(res)
funnel(taf, main = "all data")
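Finally, Egger-style regression offers a complementary asymmetry check; metafor's regtest can be applied directly to the fitted model (a minimal sketch on the pooled fit above):
# Egger's regression test for funnel plot asymmetry
regtest(res)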