This script shows the basics of how to connect to the Wordbank database, load data from it, and work with that data. We illustrate this with two sample analyses: the first works with aggregate vocabulary size data and breaks down vocabulary development by the child’s sex, and the second works with individual item data and looks at the trajectories of color words over age.

Load required libraries.

library(dplyr)
library(directlabels)
library(RMySQL)
library(tidyr)
library(bootstrap)
library(ggplot2)
library(RCurl)

Get a script that provides interface functions for pulling data out of Wordbank.

url <- 'https://raw.githubusercontent.com/langcog/wordbank/crossling/shiny_apps/data_loading.R'
script <- getURL(url, ssl.verifypeer = FALSE)
eval(parse(text = script))

Connect to the Wordbank database.

wordbank <- src_mysql(dbname='wordbank', host="54.149.39.46", #"54.200.225.86"
                      user="wordbank", password="wordbank")

Get all the common tables.

common.tables <- get.common.tables(wordbank)

Get all the instrument tables.

instrument.tables <- get.instrument.tables(wordbank, common.tables$instrumentsmap)

Load in administration data.

admins <- get.administration.data(common.tables$momed, common.tables$child,
                                  common.tables$instrumentsmap, common.tables$administration) %>%
  filter(!is.na(sex))

Number of administrations by sex for each instrument:

language form female male
Danish WS 1900 1814
English WG 1149 1226
English WS 2027 2151
Norwegian WG 1503 1522
Norwegian WS 6519 6450
Spanish WG 373 405
Spanish WS 541 553

Load in item data.

items <- get.item.data(common.tables$wordmapping,
                       common.tables$instrumentsmap)

Number of items by category for each instrument:

language form word word_form complexity
Danish WS 725 29 33
English WG 396 0 0
English WS 680 25 37
Norwegian WG 395 0 0
Norwegian WS 731 33 42
Spanish WG 428 0 0
Spanish WS 680 24 37

Sample aggregate analysis: sex differences in vocabulary development

Get by-child productive vocabulary data.

num.words <- items %>%
  filter(form == "WS", type == "word") %>%
  group_by(language) %>%
  summarise(n = n())

vocab.data <- admins %>%
  select(admin.id, language, form, age, sex, production) %>%
  filter(form == "WS", age >= 16, age <= 30) %>%
  group_by(language, sex, age) %>%
  left_join(num.words) %>%
  mutate(production = as.numeric(production)/n) %>%
  summarise_each(funs(na.median, ci.median.low, ci.median.high), production)

What these data look like for a given age:

language sex age median confidence interval (lower) confidence interval (upper)
Danish F 24 0.402 0.070 0.038
Danish M 24 0.311 0.048 0.058
English F 24 0.575 0.049 0.043
English M 24 0.406 0.037 0.069
Norwegian F 24 0.456 0.036 0.029
Norwegian M 24 0.377 0.031 0.043
Spanish F 24 0.304 0.057 0.165
Spanish M 24 0.318 0.106 0.126

Plot median productive vocabulary as a function of age.

ggplot(vocab.data,
       aes(x=age, y=na.median,colour=sex,label=sex))+
  facet_wrap(~ language) +
  geom_pointrange(aes(ymin = na.median-ci.median.low,
                      ymax = na.median+ci.median.high),
                  size = .8,
                  show_guide = FALSE) +
  geom_line(size=1) +
  scale_x_continuous(breaks=seq(16,30,2),limits=c(16,31),
                     name = "Age (months)")+
  scale_y_continuous(name = "Median Productive Vocabulary (proportion of total words)",
                     limits=c(0,1)) +
  theme_bw(base_size=14) +
  theme(legend.position="none", panel.grid=element_blank()) +
  geom_dl(method = list(dl.trans(x=x +.2),"last.qp",cex=1)) +
  scale_color_manual(values = c("#2c7bb6", "#d7191c"))

Sample by-item analysis: trajectories of color words

Set up color terms of each language.

colors.danish <- c("r\xf8d", "gul", "gr\xf8n", "bl\xe5", "sort", "hvid") # "lille" "brun"
colors.english <- c("red",  "yellow", "green", "blue", "black", "white") # "orange" "brown"
colors.norwegian <- c ("r\xf8d", "gul", "gr\xf8nn" ,"bl\xe5", "svart", "hvit") # "brun" "oransje"
colors.spanish <- c("rojo", "amarillo", "verde", "azul", "negro", "blanco") # "rosa" "morado"

colors <- list("Danish" = colors.danish,
               "English" = colors.english,
               "Norwegian" = colors.norwegian,
               "Spanish" = colors.spanish)

Function for getting color data for a given language.

get.language.data <- function(lang) {
  
  lang.table <- filter(instrument.tables, language==lang, form=="WS")$table[[1]]

  lang.colors <- colors[[lang]]
  lang.items <- select(filter(items, language==lang, form=="WS"),
                       definition, item.id)
  lang.color.items <- data.frame(color = colors.english, definition = lang.colors) %>%
    left_join(lang.items)
  
  lang.data <- get.instrument.data(lang.table, lang.color.items$item.id) %>%
    mutate(language = lang,
           item.id = paste("item_", item.id, sep="")) %>%
  left_join(lang.color.items)
  
  return(lang.data)
  
}

Load color data for each language and put it all together.

languages <- unique(instrument.tables$language)
color.data <- bind_rows(sapply(languages, get.language.data, simplify=FALSE))

color.byage.data <- color.data %>%
  left_join(filter(admins, form == "WS", age >= 16, age <= 30)) %>%
  group_by(language, color, age) %>%
  mutate(produces = value == "produces") %>%
  summarise_each(funs(na.mean, ci.mean.high, ci.mean.low), produces)

What these data look like for a given age and language:

language color age median confidence interval (lower) confidence interval (upper)
English black 24 0.370 0.047 0.040
English blue 24 0.630 0.045 0.045
English green 24 0.516 0.047 0.047
English red 24 0.558 0.047 0.047
English white 24 0.308 0.045 0.042
English yellow 24 0.541 0.042 0.047

Plot the proportion of kids that produce each color as a function of their age.

ggplot(color.byage.data, aes(x=age, y=na.mean, color=color, label=color)) +
  facet_wrap(~ language) +
  geom_pointrange(aes(ymin = na.mean-ci.mean.low,
                      ymax = na.mean+ci.mean.high),
                  size = .8,
                  show_guide = FALSE) +
  geom_line(size=1) +
  theme_bw(base_size=14) +
  theme(legend.position="none", panel.grid=element_blank()) +
  geom_dl(method = list(dl.trans(x=x +.2), "last.qp", cex=.8)) +
  scale_color_manual(values=c("black", "#2c7bb6", "#1a9641", "#d7191c", "gray87", "gold1")) +
  scale_x_continuous(breaks=seq(16,30,2), limits=c(16,31.5),
                     name = "Age (months)")+
  scale_y_continuous(name = "Proportion of Children Producing",
                     limits=c(0,1))