Preliminaries.
# Start from an empty global environment.
# NOTE(review): this also deletes any objects owned by whoever sources this
# file; consider removing it and running the script in a fresh session instead.
rm(list=ls())
# Packages used by the analysis below (plotting, reshaping, string handling).
library(ggplot2)
# plyr is attached before dplyr, so the dplyr versions of shared verbs
# (summarise, mutate, rename, arrange, ...) are the ones found first.
# Keep this order: the grouped pipelines below depend on dplyr's summarise/mutate.
library(plyr)
library(reshape2)
library(dplyr)
library(stringr)
library(tidyr)
library(markdown)
library(directlabels)
# Make the black-and-white theme the default for every ggplot2 figure.
theme_set(theme_bw())
Read data.
# Load the three first-word datasets from CSV.
df_turk <- read.csv("../data/FW_TurkData.csv")              # Mechanical Turk survey
df_cdm <- read.csv("../data/CDMlangsurvey_analysis.csv")    # CDM survey
df_info <- read.csv("../data/FW_childesinfo.csv")           # Childesinfo data

# Derived columns for the MTurk data: birth order as a factor, plus binned
# versions of age (at first word) and current age. cut() intervals are
# right-closed, e.g. (5,10], (10,14], ...
df_turk$birth <- factor(df_turk$birth_order)
df_turk$age.grp <- cut(df_turk$age, breaks = c(5, 10, 14, 18, 24))
df_turk$currage.grp <- cut(df_turk$age_current,
                           breaks = c(-1, seq(2, 18, by = 2)))

# Split every dataset at 12 months: (0,12] versus (12,24]. Ages outside
# (0,24] become NA.
df_turk$agesplit <- cut(df_turk$age, breaks = c(0, 12, 24))
df_cdm$agesplit <- cut(df_cdm$age, breaks = c(0, 12, 24))
df_info$agesplit <- cut(df_info$age, breaks = c(0, 12, 24))
Bind the three datasets together.
# Label every row with the dataset it came from, so the sources can still be
# told apart once they are stacked.
df_turk$dataset <- "MTurk"
df_cdm$dataset  <- "CDM"
df_info$dataset <- "Info"

# Stack the three data frames; rbind.fill() pads columns that are missing
# from a given data frame with NA.
d <- rbind.fill(list(df_turk, df_cdm, df_info))
Pull in wordbank data
# Open a connection to the remote Wordbank MySQL database.
# NOTE(review): the host address and credentials are hard-coded here; consider
# moving them to configuration or environment variables.
wordbank <- src_mysql(
  dbname = "wordbank",
  host = "54.200.225.86",
  user = "wordbank",
  password = "wordbank"
)

## NOW LOAD TABLES ##
# Lazy references to the remote tables; nothing is downloaded until a table
# is converted with as.data.frame further down.
admin.table <- tbl(wordbank, "common_administration")
child.table <- tbl(wordbank, "common_child")
wg.table    <- tbl(wordbank, "instruments_wg")
# Pull the MCDI word columns (col_baabaa through col_some) out of the `wg`
# instrument table and reshape them to one row per (child, word).
# NOTE(review): an earlier comment called this the "Words and Sentences"
# table, but the table read here is instruments_wg -- confirm which form it is.
wg.vocab.words <- wg.table %>%
  select(basetable_ptr_id, col_baabaa:col_some) %>%
  as.data.frame() %>%                                 # fetch from the database
  rename(id = basetable_ptr_id) %>%                   # rename the id column
  gather(word, produces, col_baabaa:col_some) %>%     # wide -> long form
  mutate(word = str_replace(word, "col_", "")) %>%    # strip the col_ prefix
  as.tbl()
# Productive vocabulary per child: the number of words whose `produces`
# code equals 2.
# NOTE(review): an NA in `produces` makes that child's total NA, because
# sum() is called without na.rm -- confirm that is the intended handling.
wg.scores <- summarise(
  group_by(wg.vocab.words, id),   # one group per child
  productive = sum(produces == 2)
)
Load demographic data for each child.
# Administration-level data: one row per CDI administration, with the child's
# age at that administration. Columns are renamed so that `id` matches the
# administration id used in the vocabulary scores and `child.id` matches the
# key of the child table.
admins <- admin.table %>%
  select(data_id, child_id, age, source_id) %>%
  rename(id = data_id, child.id = child_id, source.id = source_id) %>%
  as.data.frame()

# Child-level demographic variables. The child table's `id` is renamed to
# `child.id` so it can be joined against `admins`.
demos <- child.table %>%
  select(id, gender, mom_ed, birth_order) %>%
  rename(child.id = id) %>%
  as.data.frame()

# Attach demographics to every administration. The join key is spelled out
# (rather than relying on whichever column names happen to be shared) so that
# a future column added to either table cannot silently change the join.
child.data <- as.tbl(left_join(admins, demos, by = "child.id"))
Combine MCDI data and demographics.
# Merge vocabulary scores with age/demographics, keep only administrations
# for children aged 8-16 months (inclusive), and drop the id columns that are
# no longer needed. The join key is explicit: `id` is the administration id
# in both tables.
wg.data <- left_join(wg.scores, child.data, by = "id") %>%
  filter(age >= 8 & age <= 16) %>%   # restrict to the relevant age range
  select(-child.id, -source.id)      # drop redundant columns
Do analysis over all data for CDI-cats.
# Share of first words falling in each CDI category, computed separately for
# every dataset x age-split cell.
freqs <- d %>%
  # Drop rows without an age split and rows whose category is the literal
  # string "N/A" (rows with a missing cdi_cat are dropped as well, because
  # the comparison evaluates to NA).
  filter(!is.na(agesplit), cdi_cat != "N/A") %>%
  group_by(dataset, agesplit, cdi_cat) %>%
  summarise(n = n()) %>%
  # Regroup explicitly so the proportions are normalised within each
  # dataset x age-split cell. The former `add=FALSE` argument was the default
  # and is deprecated in current dplyr, so it is omitted.
  group_by(dataset, agesplit) %>%
  mutate(prop = n / sum(n)) %>%
  ungroup()
Plot.
# Order the category levels by ascending proportion (ties broken by category)
# so that the y axis of the dot plot is sorted. A category takes its position
# from the first time it appears in that ordering.
freqs$cdi_cat <- factor(
  freqs$cdi_cat,
  levels = unique(freqs$cdi_cat[order(freqs$prop, freqs$cdi_cat)])
)

# Dot plot: one point per dataset x age split x category; colour encodes the
# dataset and point shape encodes the age split.
ggplot(freqs,
       aes(x = prop, y = cdi_cat,
           colour = dataset, group = dataset, shape = agesplit)) +
  geom_point() +
  labs(x = "Proportion of Total", y = "MB-CDI category") +
  scale_colour_discrete(name = "Dataset")
Age data - set that up.
# For each age (in months), the fraction of Wordbank administrations whose
# productive vocabulary is greater than zero, labelled as its own dataset so
# it can be plotted alongside the survey data.
wg.producers <- mutate(
  summarise(
    group_by(wg.data, age),
    prop = sum(productive > 0) / n()
  ),
  dataset = "Wordbank"
)
# Number of reported first words per dataset and age in completed months.
# The previous extra condition `age != "NA"` compared a numeric column with a
# string and could never remove a row that !is.na(age) had kept, so it is
# dropped.
ns <- d %>%
  filter(!is.na(age)) %>%
  mutate(dataset = factor(dataset),
         age = floor(age)) %>%   # truncate to the last completed month
  group_by(dataset, age) %>%
  summarise(n = n()) %>%
  ungroup()
# Complete grid of every dataset (the survey datasets plus "Wordbank")
# crossed with every observed age, sorted by dataset and then age. Ages with
# no observations therefore still get a row.
freqs.grid <- arrange(
  expand.grid(
    dataset = c(levels(ns$dataset), "Wordbank"),
    age = unique(ns$age)
  ),
  dataset, age
)
# Cumulative proportion of children who have produced a first word by each
# age, per dataset.
freqs <- left_join(freqs.grid, ns, by = c("dataset", "age")) %>%
  # Age windows per dataset. `&` binds tighter than `|`; the parentheses make
  # the (unchanged) logic explicit: MTurk keeps every age, CDM and Info start
  # at 10 months, Wordbank at 8 months.
  filter(dataset == "MTurk" |
           (dataset == "CDM" & age >= 10) |
           (dataset == "Info" & age >= 10) |
           (dataset == "Wordbank" & age >= 8)) %>%
  mutate(n = ifelse(is.na(n), 0, n)) %>%   # grid cells with no reports
  group_by(dataset) %>%
  # Rows are already sorted by age within dataset (freqs.grid is arranged and
  # left_join keeps the left-hand row order), so cumsum() runs in age order.
  # For "Wordbank" every n is 0, so prop is 0/0 = NaN here and is filled in
  # from wg.producers below.
  mutate(cum.n = cumsum(n),
         prop = cum.n / sum(n)) %>%
  ungroup()

# Bring in the Wordbank proportions and collapse the two prop columns into
# one. Each (dataset, age) pair is a single row, so the row-wise minimum is
# taken with vectorised pmin() instead of grouping by row and calling min().
# NA/NaN values are ignored, and the constant 1 caps the result.
# NOTE(review): because of that constant, a Wordbank age with no matching row
# in wg.producers ends up with prop = 1 -- confirm that this is intended
# rather than leaving those ages missing.
freqs <- left_join(freqs, wg.producers, by = c("dataset", "age")) %>%
  mutate(prop = pmin(prop.x, prop.y, 1, na.rm = TRUE)) %>%
  select(-prop.x, -prop.y)
Plot.
# Cumulative first-word curves, one line per dataset. The dotted horizontal
# line marks 75%; the dotted vertical line marks the age of the first row in
# `freqs` whose proportion exceeds 75%.
ggplot(freqs, aes(x = age, y = prop, colour = dataset, group = dataset)) +
  geom_line(size = 1.5) +
  geom_hline(yintercept = .75, linetype = 3) +
  geom_vline(aes(xintercept = age[prop > .75][1]), linetype = 3) +
  labs(x = "Age (months)", y = "Cumulative Probability of First Word") +
  scale_x_continuous(breaks = seq(0, 24, 4)) +
  scale_colour_brewer(name = "Dataset", palette = "Set1") +
  theme_bw(base_size = 14) +
  theme(legend.position = c(.85, .3))
CDI CATS WITH AGE SPLIT at 12m
First - Turk data
#quartz()
# Count how often each standardised word was reported in the MTurk data,
# within every CDI category and age split.
freqs <- ddply(df_turk, .(cdi_cat, word_standard, agesplit), summarise,
               count = length(word_standard))

# Normalise within the age split: each word's share of all reports in its
# age-split group.
freqs <- freqs %>%
  group_by(agesplit) %>%
  mutate(prop = count / sum(count)) %>%
  ungroup()

# Order the categories by descending proportion so the bars are sorted.
freqs$cdi_cat <- factor(freqs$cdi_cat,
                        levels = unique(with(freqs, cdi_cat
                                             [order(freqs$prop, cdi_cat,
                                                    decreasing = TRUE)])))

# Bar plot of the proportions by CDI category, faceted by age split. Words
# reported only once, "N/A" words/categories and rows without an age split
# are excluded from the plot (they still count towards the denominators
# computed above).
# ggplot() + geom_bar() replaces qplot(): qplot's `stat` and `position`
# arguments have been deprecated since ggplot2 2.0.0.
# Missing age splits are tested with is.na() instead of comparison against
# the string "NA".
ggplot(subset(freqs, count > 1 & word_standard != "N/A" &
                cdi_cat != "N/A" & !is.na(agesplit)),
       aes(x = cdi_cat, y = prop)) +
  geom_bar(stat = "identity", position = "dodge") +
  theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = .5)) +
  ylab("Proportion of Children with Utterance") + xlab("CDI Categories") +
  facet_wrap(~agesplit)
Second - CDM data
# Count how often each standardised word was reported in the CDM data,
# within every CDI category and age split.
freqs <- ddply(df_cdm, .(cdi_cat, word_standard, agesplit), summarise,
               count = length(word_standard))

# Normalise within the age split: each word's share of all reports in its
# age-split group.
freqs <- freqs %>%
  group_by(agesplit) %>%
  mutate(prop = count / sum(count)) %>%
  ungroup()

# Order the categories by descending proportion so the bars are sorted.
freqs$cdi_cat <- factor(freqs$cdi_cat,
                        levels = unique(with(freqs, cdi_cat
                                             [order(freqs$prop, cdi_cat,
                                                    decreasing = TRUE)])))

# Bar plot of the proportions by CDI category, faceted by age split. Words
# reported only once, "N/A" words/categories and rows without an age split
# are excluded from the plot (they still count towards the denominators
# computed above).
# ggplot() + geom_bar() replaces qplot(): qplot's `stat` and `position`
# arguments have been deprecated since ggplot2 2.0.0.
# Missing age splits are tested with is.na() instead of comparison against
# the string "NA".
ggplot(subset(freqs, count > 1 & word_standard != "N/A" &
                cdi_cat != "N/A" & !is.na(agesplit)),
       aes(x = cdi_cat, y = prop)) +
  geom_bar(stat = "identity", position = "dodge") +
  theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = .5)) +
  ylab("Proportion of Children with Utterance") + xlab("CDI Categories") +
  facet_wrap(~agesplit)