library(knitr)
opts_chunk$set(echo = T, message = F, warning = F,
error = F, cache = F, tidy = F)
library(tidyverse)
library(langcog)
library(data.table)
library(feather)
theme_set(theme_classic(base_size = 12))INPATH1 <- "data/train_sample_longitud_mcdi.csv"
INPATH2 <- "data/test_sample_longitud_mcdi.csv"
tidy_mcdi_data <- read_csv(INPATH1) %>%
bind_rows(read_csv(INPATH2)) %>%
select(child_id, gender, age, session_num, percentile, words_spoken, item, value) n_longitidininal_kids <- tidy_mcdi_data %>%
distinct(child_id) %>%
nrow()
n_boys_kids <- tidy_mcdi_data %>%
distinct(child_id, gender) %>%
count(gender)There are 195 children in the sample that have data for more than one timepoint, are monolingual, and are not hard of hearing. There are 96 boys.
n_session_per_child <- tidy_mcdi_data %>%
distinct(child_id, session_num) %>%
count(child_id)
ggplot(n_session_per_child, aes(x = n)) +
geom_histogram(binwidth = 1) +
scale_x_continuous(breaks = 0:13) +
xlab("number of sessions") +
ylab("number of children") +
ggtitle("Number of sessions per child")To visualize all the data, here’s the percentile for each child at each session number, split by child gender. Each child is represented as a line and each session as a point. Red points are timepoints in which the child increased in their percentile relative to the previous timepoint.
tidy_mcdi_data %>%
distinct(child_id, session_num, percentile, gender) %>%
group_by(child_id) %>%
mutate(previous_percentile = lag(percentile, n = 1, order_by = session_num),
increase = percentile >= previous_percentile) %>%
ggplot(aes(x = session_num, group = child_id, y = percentile)) +
facet_wrap(.~ gender) +
scale_color_manual(values = c("grey", "red")) +
scale_x_continuous(breaks = 1:13) +
geom_line(color = "black") +
xlab("session number")+
geom_point(aes(color = increase)) Mean vocabulary, across children:
mean_percentile_by_age <- tidy_mcdi_data %>%
filter(age > 13) %>%
mutate(age_bin = cut(age, breaks = 11:34)) %>%
group_by(age_bin) %>%
multi_boot_standard(col = "percentile")
ggplot(mean_percentile_by_age, aes(x = age_bin, y = mean, group = 1)) +
geom_point() +
geom_smooth(se = F) +
xlab("age (months)") +
ylab("mean percentile") +
geom_linerange(aes(ymin = ci_lower, ymax = ci_upper)) +
theme(axis.text.x = element_text(angle = 90, hjust = 1))mean_words_by_age <- tidy_mcdi_data %>%
filter(age > 13) %>%
mutate(age_bin = cut(age, breaks = 11:34)) %>%
group_by(age_bin) %>%
multi_boot_standard(col = "words_spoken")
ggplot(mean_words_by_age, aes(x = age_bin, y = mean, group = 1)) +
geom_point() +
geom_smooth(se = F) +
ylab("mean words spoken") +
xlab("age (months)") +
geom_linerange(aes(ymin = ci_lower, ymax = ci_upper)) +
theme(axis.text.x = element_text(angle = 90, hjust = 1))In this analysis, the words in the CDI were clustered using the Wikipedia emebddings. I then got the the centroid of the cluster in 2D space. Each panel below corresponds to a 4-month age bin, where the size of the circle corresponds to the number of words in that cluster, and the shading corresponds to the mean proportion of words from that cluster that kids know.
ITEM_KEY <- "../11_hypernyms/data/item_key.csv"
item_key <- read_csv(ITEM_KEY) %>%
select(item, num_item_id)
OUTFILE_CLUSTERS <- "data/cluster_mappings.csv"
cluster_mappings <- read_csv(OUTFILE_CLUSTERS)
total_clusters <- count(cluster_mappings, cluster_id) %>%
rename(total_words = n)
produced_words <- tidy_mcdi_data %>%
filter(value > 1) %>%
select(child_id, age, item) %>%
left_join(item_key) %>%
select(-item) %>%
left_join(cluster_mappings) %>%
filter(!is.na(cluster_id)) %>%
filter(age > 13) %>%
mutate(age_bin = cut(age, breaks = seq(12, 34 , by = 4))) %>%
filter(!is.na(age_bin))
prop_know_by_age_cluster <- produced_words %>%
distinct(child_id, age_bin, cluster_id, item) %>% # some kids have multiple session/data poitn
count(child_id, age_bin, cluster_id) %>%
left_join(total_clusters) %>%
mutate(prop_know_in_clusters = n/total_words) %>%
group_by(age_bin, cluster_id) %>%
summarize(mean_prop_know_in_clusters = mean(prop_know_in_clusters))
#get_cluster_label <- function(current_df){
# print(current_df$item)
# user_cluster_label <- readline(prompt = "Enter cluster label: ")
# user_cluster_label
#}
#cluster_labels <- cluster_mappings %>%
# group_by(cluster_id) %>%
# nest() %>%
# mutate(cluster_label = "") %>%
# mutate(cluster_label = map_chr(data, get_cluster_label)) %>%
# select(cluster_id, cluster_label)
CLUSTER_NAMES <- "data/cluster_names.csv"
#write_csv(cluster_labels, CLUSTER_NAMES)
cluster_labels <- read_csv(CLUSTER_NAMES)
OUTFILE_CENTROIDS <- "data/cluster_centroids.csv"
centroids <- read_csv(OUTFILE_CENTROIDS)
centroid_df <- centroids %>%
full_join(prop_know_by_age_cluster) %>%
left_join(total_clusters) %>%
left_join(cluster_labels)centroid_df %>%
ggplot() +
ggforce::geom_circle(aes(x0 = tsne_X, y0 = tsne_Y, r = total_words/10,
fill = mean_prop_know_in_clusters)) +
geom_text(aes(x = tsne_X, y = tsne_Y, label = cluster_label), size = 2.7) +
theme_void() +
facet_wrap(.~as.factor(age_bin), ncol = 2) +
#viridis::scale_fill_viridis(begin =1 ,end = 0) +
scale_fill_gradient2("Mean proportion words known",
low = "white", high = "red"
) +
theme(legend.position="bottom")## Hypernym analysis