Longitudinal CDI data

Number of participants/timepoints
Overall vocabulary growth
Semantics analysis

library(knitr)

opts_chunk$set(echo = T, message = F, warning = F, 
               error = F, cache = F, tidy = F)

library(tidyverse)
library(langcog)
library(data.table)
library(feather)

theme_set(theme_classic(base_size = 12))

INPATH1 <- "data/train_sample_longitud_mcdi.csv"
INPATH2 <- "data/test_sample_longitud_mcdi.csv"

tidy_mcdi_data <- read_csv(INPATH1) %>%
  bind_rows(read_csv(INPATH2)) %>%
  select(child_id,  gender, age, session_num, percentile, words_spoken, item, value)

Number of participants/timepoints

n_longitidininal_kids <- tidy_mcdi_data %>%
  distinct(child_id) %>%
  nrow()

n_boys_kids <- tidy_mcdi_data %>%
  distinct(child_id, gender) %>%
  count(gender)

There are 195 children in the sample that have data for more than one timepoint, are monolingual, and are not hard of hearing. There are 96 boys.

n_session_per_child <- tidy_mcdi_data %>%
  distinct(child_id, session_num) %>% 
  count(child_id)

ggplot(n_session_per_child, aes(x = n)) +
  geom_histogram(binwidth = 1) +
  scale_x_continuous(breaks = 0:13)  +
  xlab("number of sessions") +
  ylab("number of children") +
  ggtitle("Number of sessions per child")

Overall vocabulary growth

To visualize all the data, here’s the percentile for each child at each session number, split by child gender. Each child is represented as a line and each session as a point. Red points are timepoints in which the child increased in their percentile relative to the previous timepoint.

 tidy_mcdi_data %>%
  distinct(child_id, session_num, percentile, gender) %>%
  group_by(child_id) %>%
  mutate(previous_percentile = lag(percentile, n = 1, order_by = session_num),
         increase = percentile >= previous_percentile) %>%
  ggplot(aes(x = session_num, group = child_id, y = percentile)) +
   facet_wrap(.~ gender) +
    scale_color_manual(values = c("grey", "red")) +
  scale_x_continuous(breaks = 1:13)  +
  geom_line(color = "black") +
  xlab("session number")+
  geom_point(aes(color = increase))

Mean vocabulary, across children:

mean_percentile_by_age <- tidy_mcdi_data %>%
  filter(age > 13) %>%
  mutate(age_bin = cut(age, breaks = 11:34)) %>%
  group_by(age_bin) %>%
  multi_boot_standard(col = "percentile")

ggplot(mean_percentile_by_age, aes(x = age_bin, y = mean, group = 1)) +
  geom_point() +
  geom_smooth(se = F) +
  xlab("age (months)") +
  ylab("mean percentile") +
  geom_linerange(aes(ymin = ci_lower, ymax = ci_upper))  +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))

mean_words_by_age <- tidy_mcdi_data %>%
  filter(age > 13) %>%
  mutate(age_bin = cut(age, breaks = 11:34)) %>%
  group_by(age_bin) %>%
  multi_boot_standard(col = "words_spoken")

ggplot(mean_words_by_age, aes(x = age_bin, y = mean, group = 1)) +
  geom_point() +
  geom_smooth(se = F) +
  ylab("mean words spoken") +
  xlab("age (months)") +
  geom_linerange(aes(ymin = ci_lower, ymax = ci_upper)) +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))

Semantics analysis

In this analysis, the words in the CDI were clustered using the Wikipedia emebddings. I then got the the centroid of the cluster in 2D space. Each panel below corresponds to a 4-month age bin, where the size of the circle corresponds to the number of words in that cluster, and the shading corresponds to the mean proportion of words from that cluster that kids know.

ITEM_KEY <- "../11_hypernyms/data/item_key.csv"
item_key <- read_csv(ITEM_KEY) %>%
  select(item, num_item_id)

OUTFILE_CLUSTERS <- "data/cluster_mappings.csv"
cluster_mappings <- read_csv(OUTFILE_CLUSTERS)


total_clusters <- count(cluster_mappings, cluster_id) %>%
  rename(total_words = n)

produced_words <- tidy_mcdi_data %>%
  filter(value > 1) %>%
  select(child_id, age, item) %>%
  left_join(item_key) %>%
  select(-item) %>%
  left_join(cluster_mappings) %>%
  filter(!is.na(cluster_id)) %>%
  filter(age > 13) %>%
  mutate(age_bin = cut(age, breaks = seq(12, 34 , by = 4)))  %>%
  filter(!is.na(age_bin))

prop_know_by_age_cluster <- produced_words %>%
  distinct(child_id, age_bin, cluster_id, item) %>% # some kids have multiple session/data poitn
  count(child_id, age_bin, cluster_id) %>%
  left_join(total_clusters) %>%
  mutate(prop_know_in_clusters = n/total_words) %>%
  group_by(age_bin, cluster_id) %>%
  summarize(mean_prop_know_in_clusters =  mean(prop_know_in_clusters))

#get_cluster_label <- function(current_df){ 
#  print(current_df$item)
#  user_cluster_label <- readline(prompt = "Enter cluster label: ")
#  user_cluster_label
#}

#cluster_labels <- cluster_mappings %>%
#  group_by(cluster_id) %>%
#  nest() %>%
#  mutate(cluster_label = "") %>%
#  mutate(cluster_label = map_chr(data, get_cluster_label)) %>%
#  select(cluster_id, cluster_label)

CLUSTER_NAMES <- "data/cluster_names.csv"
#write_csv(cluster_labels, CLUSTER_NAMES)
cluster_labels <- read_csv(CLUSTER_NAMES)

  
OUTFILE_CENTROIDS <- "data/cluster_centroids.csv"
centroids <- read_csv(OUTFILE_CENTROIDS)

centroid_df <- centroids %>%
  full_join(prop_know_by_age_cluster)  %>%
  left_join(total_clusters) %>%
  left_join(cluster_labels)

centroid_df %>%
  ggplot() +
  ggforce::geom_circle(aes(x0 = tsne_X, y0 = tsne_Y, r = total_words/10,
                           fill = mean_prop_know_in_clusters)) +
  geom_text(aes(x = tsne_X, y = tsne_Y, label = cluster_label), size = 2.7) +
  theme_void() + 
  facet_wrap(.~as.factor(age_bin), ncol = 2) +
  #viridis::scale_fill_viridis(begin =1 ,end = 0) +
  scale_fill_gradient2("Mean proportion words known", 
                       low = "white", high = "red"
                     )  +
  theme(legend.position="bottom")

## Hypernym analysis

Longitudinal CDI data

2018-12-05

Number of participants/timepoints

Overall vocabulary growth

Semantics analysis