Accessing the Wordbank Database

This script shows the basics of how to connect to the Wordbank database, load data from it, and work with that data. We illustrate this with two sample analyses: the first works with aggregate vocabulary size data and breaks down vocabulary development by the child’s sex, and the second works with individual item data and looks at the trajectories of color words over age.

Load required libraries.

library(dplyr)
library(directlabels)
library(RMySQL)
library(tidyr)
library(bootstrap)
library(ggplot2)
library(RCurl)

Get a script that provides interface functions for pulling data out of Wordbank.

url <- 'https://raw.githubusercontent.com/langcog/wordbank/crossling/shiny_apps/data_loading.R'
script <- getURL(url, ssl.verifypeer = FALSE)
eval(parse(text = script))

Connect to the Wordbank database.

wordbank <- src_mysql(dbname='wordbank', host="54.149.39.46", #"54.200.225.86"
                      user="wordbank", password="wordbank")

Get all the common tables.

common.tables <- get.common.tables(wordbank)

Get all the instrument tables.

instrument.tables <- get.instrument.tables(wordbank, common.tables$instrumentsmap)

Load in administration data.

admins <- get.administration.data(common.tables$momed, common.tables$child,
                                  common.tables$instrumentsmap, common.tables$administration) %>%
  filter(!is.na(sex))

Number of administrations by sex for each instrument:

language	form	female	male
Danish	WS	1900	1814
English	WG	1149	1226
English	WS	2027	2151
Norwegian	WG	1503	1522
Norwegian	WS	6519	6450
Spanish	WG	373	405
Spanish	WS	541	553

Load in item data.

items <- get.item.data(common.tables$wordmapping,
                       common.tables$instrumentsmap)

Number of items by category for each instrument:

language	form	word	word_form	complexity
Danish	WS	725	29	33
English	WG	396	0	0
English	WS	680	25	37
Norwegian	WG	395	0	0
Norwegian	WS	731	33	42
Spanish	WG	428	0	0
Spanish	WS	680	24	37

Sample aggregate analysis: sex differences in vocabulary development

Get by-child productive vocabulary data.

num.words <- items %>%
  filter(form == "WS", type == "word") %>%
  group_by(language) %>%
  summarise(n = n())

vocab.data <- admins %>%
  select(admin.id, language, form, age, sex, production) %>%
  filter(form == "WS", age >= 16, age <= 30) %>%
  group_by(language, sex, age) %>%
  left_join(num.words) %>%
  mutate(production = as.numeric(production)/n) %>%
  summarise_each(funs(na.median, ci.median.low, ci.median.high), production)

What these data look like for a given age:

language	sex	age	median	confidence interval (lower)	confidence interval (upper)
Danish	F	24	0.402	0.070	0.038
Danish	M	24	0.311	0.048	0.058
English	F	24	0.575	0.049	0.043
English	M	24	0.406	0.037	0.069
Norwegian	F	24	0.456	0.036	0.029
Norwegian	M	24	0.377	0.031	0.043
Spanish	F	24	0.304	0.057	0.165
Spanish	M	24	0.318	0.106	0.126

Plot median productive vocabulary as a function of age.

ggplot(vocab.data,
       aes(x=age, y=na.median,colour=sex,label=sex))+
  facet_wrap(~ language) +
  geom_pointrange(aes(ymin = na.median-ci.median.low,
                      ymax = na.median+ci.median.high),
                  size = .8,
                  show_guide = FALSE) +
  geom_line(size=1) +
  scale_x_continuous(breaks=seq(16,30,2),limits=c(16,31),
                     name = "Age (months)")+
  scale_y_continuous(name = "Median Productive Vocabulary (proportion of total words)",
                     limits=c(0,1)) +
  theme_bw(base_size=14) +
  theme(legend.position="none", panel.grid=element_blank()) +
  geom_dl(method = list(dl.trans(x=x +.2),"last.qp",cex=1)) +
  scale_color_manual(values = c("#2c7bb6", "#d7191c"))

Sample by-item analysis: trajectories of color words

Set up color terms of each language.

colors.danish <- c("r\xf8d", "gul", "gr\xf8n", "bl\xe5", "sort", "hvid") # "lille" "brun"
colors.english <- c("red",  "yellow", "green", "blue", "black", "white") # "orange" "brown"
colors.norwegian <- c ("r\xf8d", "gul", "gr\xf8nn" ,"bl\xe5", "svart", "hvit") # "brun" "oransje"
colors.spanish <- c("rojo", "amarillo", "verde", "azul", "negro", "blanco") # "rosa" "morado"

colors <- list("Danish" = colors.danish,
               "English" = colors.english,
               "Norwegian" = colors.norwegian,
               "Spanish" = colors.spanish)

Function for getting color data for a given language.

get.language.data <- function(lang) {
  
  lang.table <- filter(instrument.tables, language==lang, form=="WS")$table[[1]]

  lang.colors <- colors[[lang]]
  lang.items <- select(filter(items, language==lang, form=="WS"),
                       definition, item.id)
  lang.color.items <- data.frame(color = colors.english, definition = lang.colors) %>%
    left_join(lang.items)
  
  lang.data <- get.instrument.data(lang.table, lang.color.items$item.id) %>%
    mutate(language = lang,
           item.id = paste("item_", item.id, sep="")) %>%
  left_join(lang.color.items)
  
  return(lang.data)
  
}

Load color data for each language and put it all together.

languages <- unique(instrument.tables$language)
color.data <- bind_rows(sapply(languages, get.language.data, simplify=FALSE))

color.byage.data <- color.data %>%
  left_join(filter(admins, form == "WS", age >= 16, age <= 30)) %>%
  group_by(language, color, age) %>%
  mutate(produces = value == "produces") %>%
  summarise_each(funs(na.mean, ci.mean.high, ci.mean.low), produces)

What these data look like for a given age and language:

language	color	age	median	confidence interval (lower)	confidence interval (upper)
English	black	24	0.370	0.047	0.040
English	blue	24	0.630	0.045	0.045
English	green	24	0.516	0.047	0.047
English	red	24	0.558	0.047	0.047
English	white	24	0.308	0.045	0.042
English	yellow	24	0.541	0.042	0.047

Plot the proportion of kids that produce each color as a function of their age.

ggplot(color.byage.data, aes(x=age, y=na.mean, color=color, label=color)) +
  facet_wrap(~ language) +
  geom_pointrange(aes(ymin = na.mean-ci.mean.low,
                      ymax = na.mean+ci.mean.high),
                  size = .8,
                  show_guide = FALSE) +
  geom_line(size=1) +
  theme_bw(base_size=14) +
  theme(legend.position="none", panel.grid=element_blank()) +
  geom_dl(method = list(dl.trans(x=x +.2), "last.qp", cex=.8)) +
  scale_color_manual(values=c("black", "#2c7bb6", "#1a9641", "#d7191c", "gray87", "gold1")) +
  scale_x_continuous(breaks=seq(16,30,2), limits=c(16,31.5),
                     name = "Age (months)")+
  scale_y_continuous(name = "Proportion of Children Producing",
                     limits=c(0,1))

Accessing the Wordbank Database

Dan Yurovsky and Mika Braginsky

March 03, 2015

Sample aggregate analysis: sex differences in vocabulary development

Sample by-item analysis: trajectories of color words