Preliminaries.
rm(list=ls())
library(dplyr)
##
## Attaching package: 'dplyr'
##
## The following objects are masked from 'package:stats':
##
## filter, lag
##
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyr)
library(ggplot2)
library(binom)
library(lme4)
## Loading required package: Matrix
##
## Attaching package: 'Matrix'
##
## The following object is masked from 'package:tidyr':
##
## expand
library(bootstrap)
library(magrittr)
##
## Attaching package: 'magrittr'
##
## The following object is masked from 'package:tidyr':
##
## extract
library(stringr)
## for bootstrapping 95% confidence intervals
theta <- function(x,xdata,na.rm=T) {mean(xdata[x],na.rm=na.rm)}
ci.low <- function(x,na.rm=T) {
mean(x,na.rm=na.rm) - quantile(bootstrap(1:length(x),1000,theta,x,na.rm=na.rm)$thetastar,.025,na.rm=na.rm)}
ci.high <- function(x,na.rm=T) {
quantile(bootstrap(1:length(x),1000,theta,x,na.rm=na.rm)$thetastar,.975,na.rm=na.rm) - mean(x,na.rm=na.rm)}
theme_set(theme_bw())
Load data.
d1 <- read.csv("../data/experiment_1.tsv", header=TRUE,
sep="\t", row.names=NULL, stringsAsFactors = FALSE)
d2 <- read.csv("../data/experiment_2.tsv", header=TRUE,
sep="\t", row.names=NULL, stringsAsFactors = FALSE)
d3 <- read.csv("../data/experiment_3.tsv", header=TRUE,
sep="\t", row.names=NULL, stringsAsFactors = FALSE)
rd <- bind_rows(d1, d2, d3)
Constants:
questions <- c("sr","sh","sp","rs","rh","rp","hs","hr","hp","ps","pr","ph")
correct_answers <- c(1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0)
answer_key <- 1 - correct_answers ## a yes answer is coded as "0" in the data
related <- list(square = which(grepl("s", questions)),
rectangle = which(grepl("r", questions)),
rhombus = which(grepl("h", questions)),
parallel = which(grepl("p", questions)))
d <- rd %>%
select(workerid, starts_with("Answer")) %>%
rename(pre = Answer.pretest_responses_by_presented_order,
post = Answer.posttest_responses_by_presented_order,
ord = Answer.permutations_used,
time = Answer.training_time,
shape = Answer.shape_of_focus,
cond = Answer.training_regime) %>%
select(-starts_with("Answer")) %>%
rowwise %>%
do(data.frame(workerid = .$workerid,
pre = as.integer(strsplit(substr(.$pre, 2, 24), ",")[[1]]),
post = as.integer(strsplit(substr(.$post, 2, 24), ",")[[1]]),
time = .$time,
# note zero indexing of orders, that's why +1
qnum = as.integer(strsplit(substr(.$ord, 2, 26), ",")[[1]]) + 1,
cond = .$cond,
shape = .$shape)) %>%
gather(phase, resp, pre, post) %>%
mutate(cond = factor(cond,
levels = c(3,4,5),
labels = c("active", "teaching", "baseline")),
shape = factor(shape,
levels = c(0, 1, 2, 3),
labels = c("square","rectangle","rhombus","parallel")),
question = questions[qnum],
answer = answer_key[qnum],
correct = resp == answer) %>%
rowwise() %>%
mutate(relevant = qnum %in% related[shape][[1]])
## Warning in rbind_all(out[[1]]): Unequal factor levels: coercing to
## character
Between subjects
ms <- d %>%
group_by(cond, shape, phase, question, relevant) %>%
summarise(m.cih = ci.high(correct),
m.cil = ci.low(correct),
m = mean(correct))
## Warning: Grouping rowwise data frame strips rowwise nature
qplot(question, m, fill = phase,
ymin = m - m.cil, ymax = m + m.cih,
position = position_dodge(width = .9),
alpha = relevant,
facets = shape ~ cond,
data = ms,
geom = c("bar", "linerange"), stat = "identity")
Within subjects change scores
ms <- d %>%
select(-resp) %>%
spread(phase, correct) %>%
group_by(cond, shape, question, relevant) %>%
summarise(m.cih = ci.high(post - pre),
m.cil = ci.low(post - pre),
m = mean(post - pre))
qplot(question, m, fill = relevant,
ymin = m - m.cil, ymax = m + m.cih,
position = position_dodge(width = .9),
facets = shape ~ cond,
data = ms,
geom = c("bar", "linerange"), stat = "identity")
Within subjects but aggregated across questions
ms <- d %>%
select(-resp) %>%
spread(phase, correct) %>%
group_by(workerid, cond, shape, relevant) %>%
summarise(m = mean(post - pre)) %>%
group_by(cond, shape, relevant) %>%
summarise(m.cih = ci.high(m),
m.cil = ci.low(m),
m = mean(m),
n = n())
ms
## Source: local data frame [14 x 7]
## Groups: cond, shape
##
## cond shape relevant m.cih m.cil m n
## 1 active rectangle FALSE 0.03794192 0.03787879 -1.767677e-02 66
## 2 active rectangle TRUE 0.03787879 0.04040404 -5.050505e-03 66
## 3 active rhombus FALSE 0.03340278 0.03062500 1.388889e-02 60
## 4 active rhombus TRUE 0.05277778 0.05000000 -2.777778e-03 60
## 5 active parallel FALSE 0.03961864 0.03954802 -1.129944e-02 59
## 6 active parallel TRUE 0.04802260 0.04802260 2.824859e-02 59
## 7 teaching rectangle FALSE 0.05208333 0.05208333 -3.472222e-03 48
## 8 teaching rectangle TRUE 0.04861111 0.05208333 2.777778e-02 48
## 9 teaching rhombus FALSE 0.02777778 0.02777778 1.010101e-02 66
## 10 teaching rhombus TRUE 0.04927399 0.05050505 5.176768e-02 66
## 11 teaching parallel FALSE 0.04232804 0.03968254 2.910053e-02 63
## 12 teaching parallel TRUE 0.05026455 0.05291005 2.910053e-02 63
## 13 baseline rhombus FALSE 0.06306306 0.05405405 4.954955e-02 37
## 14 baseline rhombus TRUE 0.06306306 0.05405405 7.445277e-19 37
qplot(cond, m, fill = relevant,
ymin = m - m.cil, ymax = m + m.cih,
position = position_dodge(width = .9),
facets = . ~ shape,
data = ms,
geom = c("bar", "linerange"), stat = "identity")
Basic analysis over all questions - pretest only. Sanity check that we are getting reasonable answers?
ms <- d %>%
filter(phase == "pre") %>%
group_by(workerid, question) %>%
summarise(m = mean(1 - resp)) %>%
group_by(question) %>%
summarise(m.cih = ci.high(m),
m.cil = ci.low(m),
m = mean(m))
qplot(question, m,
ymin = m - m.cil, ymax = m + m.cih,
position = position_dodge(width = .9),
data = ms,
geom = c("bar", "linerange"), stat = "identity") +
geom_hline(yintercept = .5, lty = 2)