Preliminaries: loading stuff in etc.
Slow: reading in file.
Now create some extra variables for the analysis
ddf$grade <- as.numeric(as.character(ddf$gradeAtGuessTm))
## Warning: NAs introduced by coercion
ddf <- ddf %>%
mutate(g.grade = cut(grade, breaks=c(1,6,9,13,16,25),
labels=c("elementary","middle","high","college","grad")))
ddf$test.time <- mdy_hm(as.character(ddf$guess_tm))
ddf$dob <- dmy(str_join("1_",as.character(ddf$bmonth_byear)))
ddf <- ddf %>% mutate(age.yrs = as.period(interval(dob,test.time))/years(1))
## estimate only: convert to intervals for accuracy
ddf$age <- round(ddf$age.yrs)
Here’s how much data we have in terms of number of participants by age:
amnt <- ddf %>%
group_by(age) %>%
summarise(n=length(unique(uid)))
qplot(age, n, geom="bar", data=amnt, binwidth=.5, stat="identity") +
xlab("Age (years)") +
ylab("Number of unique userids")
## Warning: position_stack requires constant width: output may be incorrect
# note, some junk here
qplot(age, n, geom="bar",
data=subset(amnt,age < 50),
binwidth=.5, stat="identity") +
xlab("Age (years)") +
ylab("Number of unique userids")
and by grade:
amnt <- ddf %>%
group_by(g.grade) %>%
summarise(n=length(unique(uid)))
qplot(g.grade, n, geom="bar", data=amnt, binwidth=.5, stat="identity") +
xlab("Grade range") +
ylab("Number of unique userids")
and in terms of number of trials:
amnt <- ddf %>%
group_by(age) %>%
summarise(n=length(uid))
qplot(age, n, geom="bar", stat="identity",
data=subset(amnt,age < 50),
binwidth=.5) +
xlab("Grade range") +
ylab("Number of unique trials")
So clearly there are many trials for some participants. Here’s the distribution of contributions. Tons of people have done ten trials, few have done 1000, one person has apparently done 40k!
amnt <- ddf %>%
group_by(age,uid) %>%
summarise(n=length(uid))
qplot(n, data=subset(amnt,age<25),
binwidth=.25) +
scale_x_log10()
Now both of those things:
qplot(n, facets=~age,
data=subset(amnt, age < 25),
binwidth=.5) +
scale_x_log10()
There is tons of data that we have neither ages nor grades for. Let’s make a somewhat ad-hoc decision to drop data for kids for whom we have less than 10 observations and for ages outside the range [8, 24].
n.obs <- ddf %>%
group_by(uid) %>%
summarise(n=length(category))
ddf <- left_join(ddf,n.obs)
## Joining by: "uid"
df <- filter(ddf, age >= 8, age < 25, n > 10, !is.na(age))
dataset <- df %>%
group_by(age) %>%
summarise(participants = length(unique(uid)),
observations = length(uid))
dataset.old <- ddf %>%
group_by(age) %>%
summarise(participants = length(unique(uid)),
observations = length(uid))
dataset
## Source: local data frame [17 x 3]
##
## age participants observations
## 1 8 15 909
## 2 9 26 2422
## 3 10 28 2057
## 4 11 44 9852
## 5 12 56 4833
## 6 13 71 8573
## 7 14 95 16160
## 8 15 70 13574
## 9 16 100 22325
## 10 17 79 13241
## 11 18 94 72712
## 12 19 95 62090
## 13 20 84 45236
## 14 21 85 52978
## 15 22 69 22649
## 16 23 56 5985
## 17 24 17 14497
dataset.old
## Source: local data frame [73 x 3]
##
## age participants observations
## 1 5 4 29
## 2 6 1 3
## 3 7 8 2331
## 4 8 25 967
## 5 9 47 2542
## 6 10 57 2209
## 7 11 89 10123
## 8 12 105 5102
## 9 13 129 8882
## 10 14 155 16506
## 11 15 106 13777
## 12 16 132 22524
## 13 17 109 13394
## 14 18 116 72839
## 15 19 129 62290
## 16 20 106 45368
## 17 21 96 53049
## 18 22 92 22781
## 19 23 66 6045
## 20 24 29 14566
## 21 25 21 4290
## 22 26 18 8338
## 23 27 17 2238
## 24 28 13 606
## 25 29 15 2216
## 26 30 11 1862
## 27 31 13 1250
## 28 32 17 1276
## 29 33 17 43535
## 30 34 13 7874
## 31 35 6 1157
## 32 36 5 3605
## 33 37 6 765
## 34 38 6 117
## 35 39 7 346
## 36 40 3 36442
## 37 41 5 688
## 38 42 3 20
## 39 43 2 14
## 40 44 4 228
## 41 45 6 718
## 42 46 3 320
## 43 47 2 29
## 44 49 2 20
## 45 50 6 1315
## 46 51 2 722
## 47 52 1 28
## 48 53 3 157
## 49 54 3 130
## 50 55 2 61
## 51 56 2 49
## 52 57 4 225
## 53 59 2 196
## 54 60 2 27
## 55 61 2 159
## 56 67 2 53
## 57 68 2 1273
## 58 69 1 16
## 59 72 1 428
## 60 73 3 21
## 61 82 1 16
## 62 85 3 382
## 63 88 1 31
## 64 89 3 1067
## 65 90 11 14331
## 66 91 5 158
## 67 92 2 255
## 68 93 1 473
## 69 94 1 40
## 70 95 1 6
## 71 96 1 160
## 72 2013 63 2285
## 73 2014 48 498
qplot(age, participants, data=dataset, geom="bar",stat="identity")
qplot(age, observations, data=dataset, geom="bar",stat="identity") +
scale_y_log10()
First cut at looking at age-related differences. Note that there seem to be a lot of puff misclassificatons for the youngest kids. For this and all subsequent analyses, we’re looking at binomial 95% confidence intervals.
## Warning: 1 confidence interval failed to converge (marked by '*').
## Try changing 'tol' to a different value.
## Warning: 1 confidence interval failed to converge (marked by '*').
## Try changing 'tol' to a different value.
## Warning: 1 confidence interval failed to converge (marked by '*').
## Try changing 'tol' to a different value.
## Warning: 2 confidence intervals failed to converge (marked by '*').
## Try changing 'tol' to a different value.
## Warning: 3 confidence intervals failed to converge (marked by '*').
## Try changing 'tol' to a different value.
## Warning: 1 confidence interval failed to converge (marked by '*').
## Try changing 'tol' to a different value.
## Warning: 1 confidence interval failed to converge (marked by '*').
## Try changing 'tol' to a different value.
## Warning: 1 confidence interval failed to converge (marked by '*').
## Try changing 'tol' to a different value.
## Warning: 1 confidence interval failed to converge (marked by '*').
## Try changing 'tol' to a different value.
## Warning: 2 confidence intervals failed to converge (marked by '*').
## Try changing 'tol' to a different value.
## Warning: 3 confidence intervals failed to converge (marked by '*').
## Try changing 'tol' to a different value.
## Warning: 1 confidence interval failed to converge (marked by '*').
## Try changing 'tol' to a different value.
We can zoom in on this and see that there’s a lot of change in the relative classification of copepods and puffs.
qplot(age.grp, prop, col=category,
ymin=prop.l, ymax=prop.h,
geom=c("line","linerange"), group=category,
data=filter(props.t,!is.na(age.grp))) +
xlab("Estimated grade level") +
ylab("Proportion choosing") +
ylim(c(0,.2))
## Warning: Removed 8 rows containing missing values (geom_segment).
Let’s see if this has to do with the number of trials you’ve done…
df <- df %>%
group_by(uid) %>%
mutate(trial = 1:length(uid)) %>%
mutate(trial.cat = cut(trial,
c(0,5, 10, 50, 100, 500, 1000, 5000, 10000, 50000),
labels=
c("5", "10", "50", "100", "500", "1000",
"5000", "10000", "50000")))
props <- df %>%
group_by(category, age.grp, trial.cat) %>%
summarise(n=length(category))
props.t <- props %>%
group_by(age.grp, trial.cat) %>%
mutate(prop = n / sum(n),
prop.l = binom.confint(n, sum(n), method="bayes")$lower,
prop.h = binom.confint(n, sum(n), method="bayes")$upper)
## Warning: 2 confidence intervals failed to converge (marked by '*').
## Try changing 'tol' to a different value.
## Warning: 3 confidence intervals failed to converge (marked by '*').
## Try changing 'tol' to a different value.
## Warning: 1 confidence interval failed to converge (marked by '*').
## Try changing 'tol' to a different value.
## Warning: 2 confidence intervals failed to converge (marked by '*').
## Try changing 'tol' to a different value.
## Warning: 1 confidence interval failed to converge (marked by '*').
## Try changing 'tol' to a different value.
## Warning: 2 confidence intervals failed to converge (marked by '*').
## Try changing 'tol' to a different value.
## Warning: 3 confidence intervals failed to converge (marked by '*').
## Try changing 'tol' to a different value.
## Warning: 1 confidence interval failed to converge (marked by '*').
## Try changing 'tol' to a different value.
## Warning: 2 confidence intervals failed to converge (marked by '*').
## Try changing 'tol' to a different value.
## Warning: 1 confidence interval failed to converge (marked by '*').
## Try changing 'tol' to a different value.
qplot(trial.cat, prop, col=category,
ymin=prop.l, ymax=prop.h,
geom=c("line","linerange"), group=category,
data=props.t) +
facet_wrap(~age.grp) +
xlab("Exposures") +
ylab("Proportion choosing") +
ylim(c(0,1))
Yes, it does, but there’s also some other age-related structure. Let’s look at this another way:
qplot(trial.cat, prop, col=age.grp,
ymin=prop.l, ymax=prop.h, group=age.grp,
geom=c("line","linerange"),
data=props.t) +
facet_wrap(~category) +
xlab("Number of trials") +
ylab("Proportion choosing")
Only for nothing trials. Note that the young kids seem to have a hard time learning the base rate of nothing trials, even way out into the tail…
qplot(trial.cat, prop, col=age.grp,
ymin=prop.l, ymax=prop.h, group=age.grp,
geom=c("line","linerange"),
data=filter(props.t,
category=="nothing")) +
xlab("Number of trials") +
ylab("Proportion choosing")
Copepods:
qplot(trial.cat, prop, col=age.grp,
ymin=prop.l, ymax=prop.h, group=age.grp,
geom=c("line","linerange"),
data=filter(props.t,
category=="copepod")) +
xlab("Number of trials") +
ylab("Proportion choosing")
Puffs. Here again, note that the youngest kids are continuing to choose these more than they should - probably because a lot of “nothing” trials look a lot like puffs, so they are a powerful attractor. There’s also the possibility that the data far out on the tail are only representing a few participants (the elementary schoolers who were really devoted to this game).
qplot(trial.cat, prop, col=age.grp,
ymin=prop.l, ymax=prop.h, group=age.grp,
geom=c("line","linerange"),
data=filter(props.t,
category=="puff")) +
xlab("Number of trials") +
ylab("Proportion choosing")
Maybe we can fix this by estimating curves?
df$nothing <- df$category == "nothing"
df$log.trial <- log(df$trial)
mod <- glm(nothing ~ log.trial * age,
data=df)
summary(mod)
##
## Call:
## glm(formula = nothing ~ log.trial * age, data = df)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.0149 0.0347 0.1044 0.1734 0.7484
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.83e-02 1.05e-02 2.7 0.0069 **
## log.trial 1.31e-01 1.73e-03 76.2 <2e-16 ***
## age 2.79e-02 5.72e-04 48.8 <2e-16 ***
## log.trial:age -4.64e-03 9.14e-05 -50.8 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for gaussian family taken to be 0.1147)
##
## Null deviance: 46327 on 370092 degrees of freedom
## Residual deviance: 42447 on 370089 degrees of freedom
## AIC: 248849
##
## Number of Fisher Scoring iterations: 2
df$pred <- predict(mod)
Now actually plot this onto the same axes. The logistic fit is not really that great for a number of the age groups. This is interesting, but I’m not sure if it’s reflective of the actual shape of the data or only odd participant effects because different participant groups contributed different amounts of data.
props <- df %>%
group_by(trial.cat, category, age.grp) %>%
summarise(n=length(category),
pred = mean(pred))
props.t <- props %>%
group_by(age.grp, trial.cat) %>%
mutate(prop = n / sum(n),
pred = mean(pred),
prop.l = binom.confint(n, sum(n), method="bayes")$lower,
prop.h = binom.confint(n, sum(n), method="bayes")$upper)
## Warning: 2 confidence intervals failed to converge (marked by '*').
## Try changing 'tol' to a different value.
## Warning: 3 confidence intervals failed to converge (marked by '*').
## Try changing 'tol' to a different value.
## Warning: 1 confidence interval failed to converge (marked by '*').
## Try changing 'tol' to a different value.
## Warning: 2 confidence intervals failed to converge (marked by '*').
## Try changing 'tol' to a different value.
## Warning: 1 confidence interval failed to converge (marked by '*').
## Try changing 'tol' to a different value.
## Warning: 2 confidence intervals failed to converge (marked by '*').
## Try changing 'tol' to a different value.
## Warning: 3 confidence intervals failed to converge (marked by '*').
## Try changing 'tol' to a different value.
## Warning: 1 confidence interval failed to converge (marked by '*').
## Try changing 'tol' to a different value.
## Warning: 2 confidence intervals failed to converge (marked by '*').
## Try changing 'tol' to a different value.
## Warning: 1 confidence interval failed to converge (marked by '*').
## Try changing 'tol' to a different value.
qplot(trial.cat, prop, col=age.grp,
geom=c("line","pointrange"), group=age.grp,
ymin=prop.l, ymax=prop.h,
data=filter(props.t,category=="nothing")) +
geom_line(aes(x=trial.cat, y=pred),lty=2) +
xlab("Exposures") +
facet_wrap(~age.grp) +
ylab("Proportion choosing") +
ylim(c(0,1))
Perhaps a mixed model would fix this, but the inference is quite slow so I haven’t done it yet.
Read in and merge. Not as slow as it could be.
r <- read.csv("~/Projects/concept_net/data/whoi_ringers.csv")
rf <- tbl_df(r)
names(rf)[1] <- "true.category"
df <- left_join(df, rf, by="fname")
df$correct <- df$category == df$true.category
Now do analyses on correctness.
prop.corr <- df %>%
group_by(trial.cat, category, age.grp) %>%
summarise(corr = mean(correct, na.rm=TRUE))
qplot(trial.cat, corr, col=category, group=category,
facets=~age.grp,
geom="line",
data=prop.corr) +
ylim(c(0,1)) +
ylab("Proportion correct") +
xlab("Number of trials completed")
Not much in the way of learning effects plotted that way. Let’s do it by age/category since that was useful last time.
qplot(trial.cat, corr, col=age.grp, group=age.grp,
facets=~category,
geom="line",
data=prop.corr) +
ylim(c(0,1)) +
ylab("Proportion correct") +
xlab("Number of trials completed")
Now aggregate by age since there’s not much going on there?
prop.corr <- df %>%
group_by(trial.cat, category) %>%
summarise(corr = mean(correct, na.rm=TRUE))
qplot(trial.cat, corr, col=category, group=category,
geom="line",
data=prop.corr) +
ylim(c(0,1)) +
ylab("Proportion correct") +
xlab("Number of trials completed")