EXPLORATION OF DYER PLANKTON DATA FROM WHYVILLE

Preliminaries: loading stuff in etc.

Slow: reading in file.

Now create some extra variables for the analysis

ddf$grade <- as.numeric(as.character(ddf$gradeAtGuessTm))
## Warning: NAs introduced by coercion
ddf <- ddf %>% 
  mutate(g.grade = cut(grade, breaks=c(1,6,9,13,16,25),
                       labels=c("elementary","middle","high","college","grad")))

ddf$test.time <- mdy_hm(as.character(ddf$guess_tm))
ddf$dob <- dmy(str_join("1_",as.character(ddf$bmonth_byear)))
ddf <- ddf %>% mutate(age.yrs = as.period(interval(dob,test.time))/years(1))
## estimate only: convert to intervals for accuracy
ddf$age <- round(ddf$age.yrs)

Analysis of amount of data

Here’s how much data we have in terms of number of participants by age:

amnt <- ddf %>% 
  group_by(age) %>%
  summarise(n=length(unique(uid)))

qplot(age, n, geom="bar", data=amnt, binwidth=.5, stat="identity") + 
  xlab("Age (years)") + 
  ylab("Number of unique userids")
## Warning: position_stack requires constant width: output may be incorrect

plot of chunk amt1

# note, some junk here
qplot(age, n, geom="bar", 
      data=subset(amnt,age < 50), 
      binwidth=.5, stat="identity") + 
  xlab("Age (years)") + 
  ylab("Number of unique userids")

plot of chunk amt1

and by grade:

amnt <- ddf %>% 
  group_by(g.grade) %>%
  summarise(n=length(unique(uid)))

qplot(g.grade, n, geom="bar", data=amnt, binwidth=.5, stat="identity") + 
  xlab("Grade range") + 
  ylab("Number of unique userids")

plot of chunk amt1a

and in terms of number of trials:

amnt <- ddf %>% 
  group_by(age) %>%
  summarise(n=length(uid))

qplot(age, n, geom="bar", stat="identity",
      data=subset(amnt,age < 50), 
                  binwidth=.5) + 
  xlab("Grade range") + 
  ylab("Number of unique trials")

plot of chunk amt1b

So clearly there are many trials for some participants. Here’s the distribution of contributions. Tons of people have done ten trials, few have done 1000, one person has apparently done 40k!

amnt <- ddf %>% 
  group_by(age,uid) %>%
  summarise(n=length(uid))

qplot(n, data=subset(amnt,age<25), 
      binwidth=.25) + 
  scale_x_log10()

plot of chunk amt2

Now both of those things:

qplot(n, facets=~age, 
      data=subset(amnt, age < 25), 
      binwidth=.5) + 
  scale_x_log10()

plot of chunk amt3

There is tons of data that we have neither ages nor grades for. Let’s make a somewhat ad-hoc decision to drop data for kids for whom we have less than 10 observations and for ages outside the range [8, 24].

n.obs <- ddf %>%
  group_by(uid) %>% 
  summarise(n=length(category))

ddf <- left_join(ddf,n.obs)
## Joining by: "uid"
df <- filter(ddf, age >= 8, age < 25, n > 10, !is.na(age))

dataset <- df %>%
  group_by(age) %>% 
  summarise(participants = length(unique(uid)),
            observations = length(uid))


dataset.old <- ddf %>%
  group_by(age) %>% 
  summarise(participants = length(unique(uid)),
            observations = length(uid))
dataset
## Source: local data frame [17 x 3]
## 
##    age participants observations
## 1    8           15          909
## 2    9           26         2422
## 3   10           28         2057
## 4   11           44         9852
## 5   12           56         4833
## 6   13           71         8573
## 7   14           95        16160
## 8   15           70        13574
## 9   16          100        22325
## 10  17           79        13241
## 11  18           94        72712
## 12  19           95        62090
## 13  20           84        45236
## 14  21           85        52978
## 15  22           69        22649
## 16  23           56         5985
## 17  24           17        14497
dataset.old
## Source: local data frame [73 x 3]
## 
##     age participants observations
## 1     5            4           29
## 2     6            1            3
## 3     7            8         2331
## 4     8           25          967
## 5     9           47         2542
## 6    10           57         2209
## 7    11           89        10123
## 8    12          105         5102
## 9    13          129         8882
## 10   14          155        16506
## 11   15          106        13777
## 12   16          132        22524
## 13   17          109        13394
## 14   18          116        72839
## 15   19          129        62290
## 16   20          106        45368
## 17   21           96        53049
## 18   22           92        22781
## 19   23           66         6045
## 20   24           29        14566
## 21   25           21         4290
## 22   26           18         8338
## 23   27           17         2238
## 24   28           13          606
## 25   29           15         2216
## 26   30           11         1862
## 27   31           13         1250
## 28   32           17         1276
## 29   33           17        43535
## 30   34           13         7874
## 31   35            6         1157
## 32   36            5         3605
## 33   37            6          765
## 34   38            6          117
## 35   39            7          346
## 36   40            3        36442
## 37   41            5          688
## 38   42            3           20
## 39   43            2           14
## 40   44            4          228
## 41   45            6          718
## 42   46            3          320
## 43   47            2           29
## 44   49            2           20
## 45   50            6         1315
## 46   51            2          722
## 47   52            1           28
## 48   53            3          157
## 49   54            3          130
## 50   55            2           61
## 51   56            2           49
## 52   57            4          225
## 53   59            2          196
## 54   60            2           27
## 55   61            2          159
## 56   67            2           53
## 57   68            2         1273
## 58   69            1           16
## 59   72            1          428
## 60   73            3           21
## 61   82            1           16
## 62   85            3          382
## 63   88            1           31
## 64   89            3         1067
## 65   90           11        14331
## 66   91            5          158
## 67   92            2          255
## 68   93            1          473
## 69   94            1           40
## 70   95            1            6
## 71   96            1          160
## 72 2013           63         2285
## 73 2014           48          498
qplot(age, participants, data=dataset, geom="bar",stat="identity")

plot of chunk drops

qplot(age, observations, data=dataset, geom="bar",stat="identity") + 
  scale_y_log10()

plot of chunk drops

Analysis of categories by age

First cut at looking at age-related differences. Note that there seem to be a lot of puff misclassificatons for the youngest kids. For this and all subsequent analyses, we’re looking at binomial 95% confidence intervals.

## Warning: 1 confidence interval failed to converge (marked by '*').
##   Try changing 'tol' to a different value.
## Warning: 1 confidence interval failed to converge (marked by '*').
##   Try changing 'tol' to a different value.
## Warning: 1 confidence interval failed to converge (marked by '*').
##   Try changing 'tol' to a different value.
## Warning: 2 confidence intervals failed to converge (marked by '*').
##   Try changing 'tol' to a different value.
## Warning: 3 confidence intervals failed to converge (marked by '*').
##   Try changing 'tol' to a different value.
## Warning: 1 confidence interval failed to converge (marked by '*').
##   Try changing 'tol' to a different value.
## Warning: 1 confidence interval failed to converge (marked by '*').
##   Try changing 'tol' to a different value.
## Warning: 1 confidence interval failed to converge (marked by '*').
##   Try changing 'tol' to a different value.
## Warning: 1 confidence interval failed to converge (marked by '*').
##   Try changing 'tol' to a different value.
## Warning: 2 confidence intervals failed to converge (marked by '*').
##   Try changing 'tol' to a different value.
## Warning: 3 confidence intervals failed to converge (marked by '*').
##   Try changing 'tol' to a different value.
## Warning: 1 confidence interval failed to converge (marked by '*').
##   Try changing 'tol' to a different value.

plot of chunk categories

We can zoom in on this and see that there’s a lot of change in the relative classification of copepods and puffs.

qplot(age.grp, prop, col=category,
      ymin=prop.l, ymax=prop.h,
      geom=c("line","linerange"), group=category,
      data=filter(props.t,!is.na(age.grp))) + 
  xlab("Estimated grade level") + 
  ylab("Proportion choosing") + 
  ylim(c(0,.2))
## Warning: Removed 8 rows containing missing values (geom_segment).

plot of chunk categories2

Let’s see if this has to do with the number of trials you’ve done…

df <- df %>% 
  group_by(uid) %>%
  mutate(trial = 1:length(uid)) %>%
  mutate(trial.cat = cut(trial, 
                         c(0,5, 10, 50, 100, 500, 1000, 5000, 10000, 50000),
                         labels=
                           c("5", "10", "50", "100", "500", "1000", 
                             "5000", "10000", "50000")))

props <- df %>%
  group_by(category, age.grp, trial.cat) %>%
  summarise(n=length(category))

props.t <- props %>%
  group_by(age.grp, trial.cat) %>%
  mutate(prop = n / sum(n),
         prop.l = binom.confint(n, sum(n), method="bayes")$lower,
         prop.h = binom.confint(n, sum(n), method="bayes")$upper)
## Warning: 2 confidence intervals failed to converge (marked by '*').
##   Try changing 'tol' to a different value.
## Warning: 3 confidence intervals failed to converge (marked by '*').
##   Try changing 'tol' to a different value.
## Warning: 1 confidence interval failed to converge (marked by '*').
##   Try changing 'tol' to a different value.
## Warning: 2 confidence intervals failed to converge (marked by '*').
##   Try changing 'tol' to a different value.
## Warning: 1 confidence interval failed to converge (marked by '*').
##   Try changing 'tol' to a different value.
## Warning: 2 confidence intervals failed to converge (marked by '*').
##   Try changing 'tol' to a different value.
## Warning: 3 confidence intervals failed to converge (marked by '*').
##   Try changing 'tol' to a different value.
## Warning: 1 confidence interval failed to converge (marked by '*').
##   Try changing 'tol' to a different value.
## Warning: 2 confidence intervals failed to converge (marked by '*').
##   Try changing 'tol' to a different value.
## Warning: 1 confidence interval failed to converge (marked by '*').
##   Try changing 'tol' to a different value.
qplot(trial.cat, prop, col=category,
      ymin=prop.l, ymax=prop.h,
      geom=c("line","linerange"), group=category,
      data=props.t) + 
  facet_wrap(~age.grp) +
  xlab("Exposures") + 
  ylab("Proportion choosing") + 
  ylim(c(0,1))

plot of chunk categories3

Yes, it does, but there’s also some other age-related structure. Let’s look at this another way:

qplot(trial.cat, prop, col=age.grp,
      ymin=prop.l, ymax=prop.h, group=age.grp,
      geom=c("line","linerange"), 
      data=props.t) + 
  facet_wrap(~category) +
  xlab("Number of trials") +
  ylab("Proportion choosing") 

plot of chunk categories4

Only for nothing trials. Note that the young kids seem to have a hard time learning the base rate of nothing trials, even way out into the tail…

qplot(trial.cat, prop, col=age.grp,
      ymin=prop.l, ymax=prop.h, group=age.grp,
      geom=c("line","linerange"), 
      data=filter(props.t,
                  category=="nothing")) + 
  xlab("Number of trials") + 
  ylab("Proportion choosing") 

plot of chunk categories5

Copepods:

qplot(trial.cat, prop, col=age.grp,
      ymin=prop.l, ymax=prop.h, group=age.grp,
      geom=c("line","linerange"), 
      data=filter(props.t,
                  category=="copepod")) + 
  xlab("Number of trials") + 
  ylab("Proportion choosing")

plot of chunk categories6

Puffs. Here again, note that the youngest kids are continuing to choose these more than they should - probably because a lot of “nothing” trials look a lot like puffs, so they are a powerful attractor. There’s also the possibility that the data far out on the tail are only representing a few participants (the elementary schoolers who were really devoted to this game).

qplot(trial.cat, prop, col=age.grp,
      ymin=prop.l, ymax=prop.h, group=age.grp,
      geom=c("line","linerange"), 
      data=filter(props.t,
                  category=="puff")) + 
  xlab("Number of trials") + 
  ylab("Proportion choosing") 

plot of chunk categories7

Maybe we can fix this by estimating curves?

df$nothing <- df$category == "nothing"
df$log.trial <- log(df$trial)
mod <- glm(nothing ~ log.trial * age,
             data=df)
summary(mod)
## 
## Call:
## glm(formula = nothing ~ log.trial * age, data = df)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.0149   0.0347   0.1044   0.1734   0.7484  
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    2.83e-02   1.05e-02     2.7   0.0069 ** 
## log.trial      1.31e-01   1.73e-03    76.2   <2e-16 ***
## age            2.79e-02   5.72e-04    48.8   <2e-16 ***
## log.trial:age -4.64e-03   9.14e-05   -50.8   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 0.1147)
## 
##     Null deviance: 46327  on 370092  degrees of freedom
## Residual deviance: 42447  on 370089  degrees of freedom
## AIC: 248849
## 
## Number of Fisher Scoring iterations: 2
df$pred <- predict(mod)

Now actually plot this onto the same axes. The logistic fit is not really that great for a number of the age groups. This is interesting, but I’m not sure if it’s reflective of the actual shape of the data or only odd participant effects because different participant groups contributed different amounts of data.

props <- df %>% 
  group_by(trial.cat, category, age.grp) %>%
  summarise(n=length(category),
            pred = mean(pred))

props.t <- props %>%
  group_by(age.grp, trial.cat) %>%
  mutate(prop = n / sum(n),
         pred = mean(pred), 
         prop.l = binom.confint(n, sum(n), method="bayes")$lower,
         prop.h = binom.confint(n, sum(n), method="bayes")$upper)
## Warning: 2 confidence intervals failed to converge (marked by '*').
##   Try changing 'tol' to a different value.
## Warning: 3 confidence intervals failed to converge (marked by '*').
##   Try changing 'tol' to a different value.
## Warning: 1 confidence interval failed to converge (marked by '*').
##   Try changing 'tol' to a different value.
## Warning: 2 confidence intervals failed to converge (marked by '*').
##   Try changing 'tol' to a different value.
## Warning: 1 confidence interval failed to converge (marked by '*').
##   Try changing 'tol' to a different value.
## Warning: 2 confidence intervals failed to converge (marked by '*').
##   Try changing 'tol' to a different value.
## Warning: 3 confidence intervals failed to converge (marked by '*').
##   Try changing 'tol' to a different value.
## Warning: 1 confidence interval failed to converge (marked by '*').
##   Try changing 'tol' to a different value.
## Warning: 2 confidence intervals failed to converge (marked by '*').
##   Try changing 'tol' to a different value.
## Warning: 1 confidence interval failed to converge (marked by '*').
##   Try changing 'tol' to a different value.
qplot(trial.cat, prop, col=age.grp,
      geom=c("line","pointrange"), group=age.grp,
      ymin=prop.l, ymax=prop.h,
      data=filter(props.t,category=="nothing")) + 
  geom_line(aes(x=trial.cat, y=pred),lty=2) + 
  xlab("Exposures") + 
  facet_wrap(~age.grp) + 
  ylab("Proportion choosing") + 
  ylim(c(0,1))

plot of chunk unnamed-chunk-2

Perhaps a mixed model would fix this, but the inference is quite slow so I haven’t done it yet.

Add ringers to look at accuracy

Read in and merge. Not as slow as it could be.

r <- read.csv("~/Projects/concept_net/data/whoi_ringers.csv")
rf <- tbl_df(r)
names(rf)[1] <- "true.category"

df <- left_join(df, rf, by="fname")
df$correct <- df$category == df$true.category

Now do analyses on correctness.

prop.corr <- df %>%
  group_by(trial.cat, category, age.grp) %>%
  summarise(corr = mean(correct, na.rm=TRUE))

qplot(trial.cat, corr, col=category, group=category,
      facets=~age.grp,
      geom="line",
      data=prop.corr) + 
  ylim(c(0,1)) +
  ylab("Proportion correct") + 
  xlab("Number of trials completed")

plot of chunk unnamed-chunk-4

Not much in the way of learning effects plotted that way. Let’s do it by age/category since that was useful last time.

qplot(trial.cat, corr, col=age.grp, group=age.grp,
      facets=~category,
      geom="line",
      data=prop.corr) + 
  ylim(c(0,1)) +
  ylab("Proportion correct") + 
  xlab("Number of trials completed")

plot of chunk unnamed-chunk-5

Now aggregate by age since there’s not much going on there?

prop.corr <- df %>%
  group_by(trial.cat, category) %>%
  summarise(corr = mean(correct, na.rm=TRUE))

qplot(trial.cat, corr, col=category, group=category,
      geom="line",
      data=prop.corr) + 
  ylim(c(0,1)) +
  ylab("Proportion correct") + 
  xlab("Number of trials completed")

plot of chunk unnamed-chunk-6