Preliminaries.

# Remove every visible object from the current environment before starting.
rm(list=ls())
library(ggplot2)
# plyr is attached before dplyr, so for verbs exported by both packages
# (summarise, mutate, rename, arrange, ...) the dplyr versions mask plyr's.
library(plyr)
library(reshape2)
library(dplyr)
library(stringr)
library(tidyr)
library(markdown)
library(directlabels)

# Make the black-and-white theme the default ggplot2 theme.
theme_set(theme_bw())

Read data.

# Load the three first-word datasets from CSV.
df_turk <- read.csv("../data/FW_TurkData.csv")             # Turk data
df_cdm  <- read.csv("../data/CDMlangsurvey_analysis.csv")  # CDM data
df_info <- read.csv("../data/FW_childesinfo.csv")          # Childesinfo data

# Derived columns that only the Turk data receives: birth order as a factor,
# plus binned versions of the reported age and the current age.
df_turk$birth <- factor(df_turk$birth_order)
df_turk$age.grp <- cut(df_turk$age, breaks = c(5, 10, 14, 18, 24))
df_turk$currage.grp <- cut(df_turk$age_current,
                           breaks = c(-1, seq(2, 18, by = 2)))

# Every dataset gets the same two-way age split: (0,12] versus (12,24].
df_turk$agesplit <- cut(df_turk$age, breaks = c(0, 12, 24))
df_cdm$agesplit  <- cut(df_cdm$age, breaks = c(0, 12, 24))
df_info$agesplit <- cut(df_info$age, breaks = c(0, 12, 24))

Bind the three datasets together.

# Label each data frame with its source so rows stay identifiable after merging.
df_info$dataset <- "Info"
df_cdm$dataset <- "CDM"
df_turk$dataset <- "MTurk"

# Row-bind the three frames; rbind.fill pads columns missing from a frame
# with NA.
d <- rbind.fill(list(df_turk, df_cdm, df_info))

Pull in the Wordbank data.

# Open a connection to the remote Wordbank MySQL database.
wordbank <- src_mysql(
  dbname = "wordbank",
  host = "54.200.225.86",
  user = "wordbank",
  password = "wordbank"
)

# Lazy handles to the tables used below.
admin.table <- wordbank %>% tbl("common_administration")
child.table <- wordbank %>% tbl("common_child")
wg.table <- wordbank %>% tbl("instruments_wg")

# Pull the word columns (col_baabaa through col_some) out of the
# instruments_wg table and reshape them to one row per id and word.
wg.vocab.words <- wg.table %>%
  select(basetable_ptr_id, col_baabaa:col_some) %>%
  as.data.frame() %>%                               # fetch into a local frame
  rename(id = basetable_ptr_id) %>%                 # shorter id column name
  gather(word, produces, col_baabaa:col_some) %>%   # wide -> long
  mutate(word = str_replace(word, "col_", "")) %>%  # strip the col_ prefix
  as.tbl()

# Productive vocabulary per id: the number of words whose `produces` code is 2.
wg.scores <- summarise(
  group_by(wg.vocab.words, id),
  productive = sum(produces == 2)
)

Load demographic data for each child.

# Pull the administration id, child id, age and source id from the
# administration table, renaming the id columns to the names used below.
admins <- admin.table %>%
  select(data_id, child_id, age, source_id) %>%
  rename(id = data_id, child.id = child_id, source.id = source_id) %>%
  as.data.frame()

# Pull the demographic variables for each child; `id` becomes `child.id` so it
# lines up with the key in `admins`.
demos <- child.table %>%
  select(id, gender, mom_ed, birth_order) %>%
  rename(child.id = id) %>%
  as.data.frame()

# Join age and demographics together. The key is named explicitly: `child.id`
# is the only column the two frames share, and spelling it out keeps the join
# from silently changing if another shared column is ever selected.
child.data <- as.tbl(left_join(admins, demos, by = "child.id"))

Combine the MCDI vocabulary scores with the demographics.

# Attach age and demographics to each productive-vocabulary score, keep only
# administrations at 8 to 16 months, and drop the id columns no longer needed.
# `id` is the only column shared by the two tables; it is named explicitly so
# the join key cannot change silently.
wg.data <- left_join(wg.scores, child.data, by = "id") %>%
  filter(age >= 8, age <= 16) %>%
  select(-child.id, -source.id)

Analyze the CDI categories across all datasets.

# Proportion of first words falling in each CDI category, computed separately
# for every dataset x age-split cell. Rows without an age split and rows whose
# category is "N/A" are excluded before counting.
freqs <- d %>%
  filter(!is.na(agesplit), cdi_cat != "N/A") %>%
  group_by(dataset, agesplit, cdi_cat) %>%
  summarise(n = n()) %>%
  # Regroup by cell only. group_by() replaces existing groups by default, so
  # the deprecated `add = FALSE` argument is not needed.
  group_by(dataset, agesplit) %>%
  mutate(prop = n / sum(n))

Plot.

# Re-level cdi_cat by ascending proportion (ties broken by category) so the
# plot axis is ordered by size rather than alphabetically.
freqs$cdi_cat <- local({
  ranking <- order(freqs$prop, freqs$cdi_cat, decreasing = FALSE)
  factor(freqs$cdi_cat, levels = unique(freqs$cdi_cat[ranking]))
})


# Dot plot of category proportions: colour distinguishes the dataset, point
# shape the age split.
ggplot(
  freqs,
  aes(x = prop, y = cdi_cat, colour = dataset, group = dataset,
      shape = agesplit)
) +
  geom_point() +
  labs(x = "Proportion of Total", y = "MB-CDI category") +
  scale_color_discrete(name = "Dataset")

Set up the age data.

# For each age, the share of Wordbank administrations with a productive
# vocabulary above zero, labelled as its own dataset.
wg.producers <- mutate(
  summarise(group_by(wg.data, age), prop = sum(productive > 0) / n()),
  dataset = "Wordbank"
)

# Number of reports per dataset and age in completed months.
# `age` is numeric (it is passed to cut() and floor()), so is.na() is the only
# missing-value check needed; the former comparison against the string "NA"
# was redundant.
ns <- d %>%
  filter(!is.na(age)) %>%
  mutate(dataset = factor(dataset),
         age = floor(age)) %>% # round down to the last completed month
  group_by(dataset, age) %>%
  summarise(n = n())

# Full dataset x age grid (including a "Wordbank" dataset) so that ages with
# no reports still get a row; sorted by dataset, then age.
freqs.grid <- arrange(
  expand.grid(
    dataset = c(levels(ns$dataset), "Wordbank"),
    age = unique(ns$age)
  ),
  dataset, age
)

# Cumulative proportion of reported first words by age within each dataset.
# The join key is explicit, and the per-dataset age floors are parenthesised
# so the precedence of `&` over `|` does not have to be inferred.
freqs <- left_join(freqs.grid, ns, by = c("dataset", "age")) %>%
  filter((dataset == "MTurk") |
           (dataset == "CDM" & age >= 10) |
           (dataset == "Info" & age >= 10) |
           (dataset == "Wordbank" & age >= 8)) %>%
  ungroup() %>%
  mutate(n = ifelse(is.na(n), 0, n)) %>% # grid cells with no reports count as 0
  group_by(dataset) %>%
  # Rows are already sorted by age within dataset, so cumsum() runs in age
  # order. "Wordbank" has no rows in `ns`, so its n is all 0 and its prop here
  # is 0/0 = NaN.
  mutate(cum.n = cumsum(n),
         prop = cum.n / sum(n))

# Merge in the Wordbank proportions. After the join, prop.x is the cumulative
# proportion computed from the counts and prop.y is the Wordbank proportion;
# take whichever is available, capped at 1.
# The previous min(prop.x, prop.y, 1, na.rm = TRUE) returned 1 whenever both
# values were missing, which assigned prop = 1 to rows that have no estimate
# at all (e.g. Wordbank ages with no matching row in wg.producers). Those rows
# are now left as NA so they are not drawn.
freqs <- left_join(freqs, wg.producers, by = c("dataset", "age")) %>%
  ungroup() %>%
  mutate(prop = ifelse(is.na(prop.x) & is.na(prop.y),
                       NA_real_,
                       pmin(prop.x, prop.y, 1, na.rm = TRUE))) %>%
  select(-prop.x, -prop.y)

Plot.

# Cumulative-probability curves, one line per dataset. The dotted horizontal
# line marks 0.75; the dotted vertical line sits at the first age (in the row
# order of `freqs`) whose prop exceeds 0.75.
ggplot(freqs, aes(x = age, y = prop, colour = dataset, group = dataset)) +
  geom_line(size = 1.5) +
  geom_hline(yintercept = .75, linetype = 3) +
  geom_vline(aes(xintercept = age[prop > .75][1]), linetype = 3) +
  scale_x_continuous(breaks = seq(0, 24, 4)) +
  scale_color_brewer(name = "Dataset", palette = "Set1") +
  labs(x = "Age (months)", y = "Cumulative Probability of First Word") +
  theme_bw(base_size = 14) +
  theme(legend.position = c(.85, .3))

CDI categories with an age split at 12 months

First - Turk data

#quartz()
# Count how often each standardized word occurs in every
# CDI category x age-split cell of the Turk data.
freqs <- ddply(
  df_turk,
  c("cdi_cat", "word_standard", "agesplit"),
  summarise,
  count = length(word_standard)
)

# Turn counts into proportions within each age split.
freqs <- mutate(group_by(freqs, agesplit), prop = count / sum(count))

# Order the categories by descending proportion (ties broken by category)
# so the bars appear largest first.
freqs$cdi_cat <- local({
  ranking <- order(freqs$prop, freqs$cdi_cat, decreasing = TRUE)
  factor(freqs$cdi_cat, levels = unique(freqs$cdi_cat[ranking]))
})
# Bar chart of the within-age-split proportions by CDI category, one facet per
# age split. Words counted only once, "N/A" words and categories, and rows
# with a missing age split are excluded.
# The layer is built with ggplot() + geom_bar(stat = "identity") because
# qplot()'s `stat` and `position` arguments have been deprecated since
# ggplot2 2.0.0, so the bar geom would fall back to a counting stat that
# rejects a y aesthetic. The missing-age-split test uses is.na() instead of a
# comparison with the string "NA".
# NOTE(review): the data has one row per word, so several bars can share a
# category within a facet - confirm whether they should be summed first.
ggplot(data = subset(freqs,
                     count > 1 & word_standard != "N/A" & cdi_cat != "N/A" &
                       !is.na(agesplit)),
       aes(x = cdi_cat, y = prop)) +
  geom_bar(stat = "identity", position = "dodge") +
  theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = .5)) +
  ylab("Proportion of Children with Utterance") +
  xlab("CDI Categories") +
  facet_wrap(~agesplit)

Second - CDM data

# Count how often each standardized word occurs in every
# CDI category x age-split cell of the CDM data.
freqs <- ddply(
  df_cdm,
  c("cdi_cat", "word_standard", "agesplit"),
  summarise,
  count = length(word_standard)
)

# Turn counts into proportions within each age split.
freqs <- mutate(group_by(freqs, agesplit), prop = count / sum(count))

# Order the categories by descending proportion (ties broken by category).
freqs$cdi_cat <- local({
  ranking <- order(freqs$prop, freqs$cdi_cat, decreasing = TRUE)
  factor(freqs$cdi_cat, levels = unique(freqs$cdi_cat[ranking]))
})

# Bar chart of the within-age-split proportions by CDI category for the CDM
# data, one facet per age split. Words counted only once, "N/A" words and
# categories, and rows with a missing age split are excluded.
# Built with ggplot() + geom_bar(stat = "identity") because qplot()'s `stat`
# and `position` arguments have been deprecated since ggplot2 2.0.0; the
# missing-age-split test uses is.na() instead of a comparison with "NA".
# NOTE(review): the data has one row per word, so several bars can share a
# category within a facet - confirm whether they should be summed first.
ggplot(data = subset(freqs,
                     count > 1 & word_standard != "N/A" & cdi_cat != "N/A" &
                       !is.na(agesplit)),
       aes(x = cdi_cat, y = prop)) +
  geom_bar(stat = "identity", position = "dodge") +
  theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = .5)) +
  ylab("Proportion of Children with Utterance") +
  xlab("CDI Categories") +
  facet_wrap(~agesplit)