Here, we look at the difference in the distributions across languages in terms of mean, variance, and cohesian.
Read in doctag indices, docvecs, and metadata
doctag_indices <- read_feather("../../../data/processed/models/all.model/doctag_indices_all_model.feather")
docvecs <- read_feather("../../../data/processed/models/all.model/docvecs_all_model.feather") %>%
as.data.frame() %>%
mutate(offset = 0:(n()-1)) %>%
select(offset, everything())
#write_tsv(docvecs, "../all_data/docvecs.txt")
metadata <- read_csv("../../../data/raw/merged_metadata.csv")
metadata_clean <- metadata %>%
mutate_if(is.character, as.factor) %>%
mutate(essay_id = as.character(essay_id))
Language summaries
kable(count(metadata_clean, L1_code), caption = "L1 distribution")
| L1_code | n |
|---|---|
| ARA | 1100 |
| BEN | 1100 |
| BUL | 1100 |
| CHI | 1100 |
| DUT | 1100 |
| ENG | 1100 |
| FAS | 1100 |
| FRE | 1100 |
| GER | 1100 |
| GRE | 1100 |
| GUJ | 1100 |
| HIN | 1100 |
| IBO | 1100 |
| IND | 1100 |
| ITA | 1100 |
| JPN | 1100 |
| KAN | 1100 |
| KOR | 1100 |
| MAL | 1100 |
| MAR | 1100 |
| NEP | 1100 |
| PAN | 1100 |
| POL | 1100 |
| POR | 1100 |
| RUM | 1100 |
| RUS | 1100 |
| SPA | 1100 |
| TAM | 1100 |
| TEL | 1100 |
| TGL | 1100 |
| THA | 1100 |
| TUR | 1100 |
| URD | 1100 |
| VIE | 1100 |
| YOR | 1100 |
Merge all data sources together
d <- doctag_indices %>%
left_join(metadata_clean) %>%
left_join(docvecs) %>%
nest(-1:-10, .key = "doc_vector") %>%
select(-doc_count, -offset, -test_center_country_code)
Get TSNE coordinates.
#mats <- d %>%
# select(doc_vector) %>%
# unnest() %>%
# as.matrix()
#tsne_out = Rtsne::Rtsne(mats)
#tsne_dims <- tsne_out$Y %>%
# as.data.frame() %>%
# rename(tsne_X = V1,
# tsne_Y = V2) %>%
# bind_cols(d %>% select(essay_id, score, age, gender, L1_code, prompt_id)) %>%
# select(everything(), tsne_X, tsne_Y)
# write_csv(tsne_dims, "../../../data/processed/tsne/tsne_dims_cached_all.csv")
tsne_dims <- read_csv("../../../data/processed/tsne/tsne_dims_cached_all.csv")
ggplot(tsne_dims, aes(x = tsne_X, y = tsne_Y, color = L1_code)) +
geom_point(size = .2) +
theme_minimal() +
ggtitle("All essays")
ggplot(tsne_dims, aes(x = tsne_X, y = tsne_Y, color = L1_code)) +
#geom_density2d(alpha=.5, aes(color = prompt_id)) +
facet_wrap(~L1_code)+
geom_point(size = .2) +
#geom_hex(bins = 20) +
theme_minimal()
tsne_centroids_by_prompt <- tsne_dims %>%
nest(1:2, .key = "doc_vector") %>%
mutate(L1_prompt = paste0(L1_code, "_", prompt_id )) %>%
split(.$L1_prompt) %>%
map_df(get_group_centroid, "L1_prompt") %>%
separate(L1_prompt, c("L1_code", "prompt_id"), sep = "_")
tsne_prompt_mean_centroids <- tsne_centroids_by_prompt %>%
split(.$L1_code) %>%
map_df(~colMeans(.[,-1:-2])) %>%
t() %>%
as.data.frame() %>%
rownames_to_column("L1_code") %>%
rename(tsne_X = V1,
tsne_Y = V2)
ggplot(tsne_centroids_by_prompt, aes(x = tsne_X, y = tsne_Y)) +
geom_point(aes(color = L1_code)) +
facet_wrap( ~prompt_id, scales = "free") +
theme_minimal()
ggplot(tsne_prompt_mean_centroids) +
geom_label(aes(x = tsne_X, y = tsne_Y, label = L1_code, fill = L1_code), size =4) +
#geom_point(aes(x = tsne_X, y = tsne_Y), size = 2, color = 'black') +
#ggrepel::geom_label_repel(
# aes(tsne_X, y = tsne_Y, fill = L1_code, label = L1_code),
# fontface = 'bold', color = 'black',
# box.padding = unit(0.35, "lines"),
# point.padding = unit(0.2, "lines"),
# segment.color = 'black') +
ggtitle("Centroids averaging across prompts") +
theme_minimal() +
theme(legend.position = "none",
text = element_text(size = 15))
Diameter is average pairwise distance between within-language essays in 2D-space.
tsne_diameters_prompt <- tsne_dims %>%
nest(1:2, .key = "doc_vector") %>%
mutate(L1_prompt = paste0(L1_code, "_", prompt_id )) %>%
split(.$L1_prompt) %>%
map_df(get_group_diameter, "L1_prompt", "mean_dist") %>%
separate(L1_prompt, c("L1_code", "prompt_id"), sep = "_")
tsne_prompt_mean_diameter <- tsne_diameters_prompt %>%
group_by(L1_code) %>%
multi_boot_standard(col = "diameter", na.rm = TRUE)
ggplot(tsne_prompt_mean_diameter, aes(x = reorder(L1_code, mean),
y = mean,
fill = L1_code)) +
geom_bar(stat = "identity") +
geom_linerange(aes(ymin = ci_lower, ymax = ci_upper)) +
xlab("L1") +
ggtitle("Diameter, averaging across prompts") +
theme_minimal() +
theme(legend.position = "none",
axis.text.x = element_text(angle = 90, hjust = 1))
For each prompt, for each pair of languages, I take ratio of within-group to between group distances. So, take the mean of the pairwise distances amongst all the essays in L1 and amongst all the essays in L2, and the mean pairwise distances between each essay in L1 and in L2 (Between), then compute Between / ((L1 + L2)/2). This number is 1 if there is no clustering, but gets larger with better clustering.
#pairwise_tsne_cohesion <- pmap_df(all_lang_combos2,
# get_2d_coherence_prompt,
# tsne_dims)
#write_feather(pairwise_tsne_cohesion, "../../../data/processed/pairwise_tsne_cohesion_all.feather")
pairwise_tsne_cohesion <- read_feather("../../../data/processed/pairwise_tsne_cohesion_all.feather")
countrywise_cohesions <- pairwise_tsne_cohesion %>%
group_by(L1_code_1, prompt_id) %>%
summarize(cohesion = mean(cohesion)) %>%
group_by(L1_code_1) %>%
multi_boot_standard(col = "cohesion", na.rm = TRUE) %>%
rename(L1_code = L1_code_1) # symmetrical for 1 and 2
ggplot(countrywise_cohesions, aes(x = reorder(L1_code, mean),
y = mean,
fill = L1_code)) +
geom_bar(stat = "identity") +
geom_linerange(aes(ymin = ci_lower, ymax = ci_upper)) +
xlab("L1") +
ggtitle("Cohesion, averaging across prompts and pairs") +
theme_minimal() +
theme(legend.position = "none") +
theme(legend.position = "none",
axis.text.x = element_text(angle = 90, hjust = 1))
Pairwise country dendogram of cohesion
pairwise_tsne_cohesion_mat <- pairwise_tsne_cohesion %>%
group_by(L1_code_1, L1_code_2) %>%
summarize(cohesion = mean(cohesion)) %>%
select(L1_code_1, L1_code_2, cohesion) %>%
spread(L1_code_2, cohesion) %>%
remove_rownames() %>%
column_to_rownames("L1_code_1") %>%
as.matrix()
dist_matrix <- dist(pairwise_tsne_cohesion_mat)
dist_matrix_kable <- as.matrix(dist_matrix)
dist_matrix_kable[upper.tri(dist_matrix_kable)] <- NA
kable(as.data.frame(dist_matrix_kable), digits = 2)
| ARA | BEN | BUL | CHI | DUT | ENG | FAS | FRE | GER | GRE | GUJ | HIN | IBO | IND | ITA | JPN | KAN | KOR | MAL | MAR | NEP | PAN | POL | POR | RUM | RUS | SPA | TAM | TEL | TGL | THA | TUR | URD | VIE | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| ARA | 0.00 | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA |
| BEN | 0.83 | 0.00 | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA |
| BUL | 0.27 | 0.99 | 0.00 | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA |
| CHI | 0.10 | 0.90 | 0.24 | 0.00 | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA |
| DUT | 0.41 | 0.93 | 0.24 | 0.42 | 0.00 | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA |
| ENG | 0.45 | 0.61 | 0.70 | 0.51 | 0.75 | 0.00 | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA |
| FAS | 0.10 | 0.91 | 0.22 | 0.05 | 0.39 | 0.53 | 0.00 | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA |
| FRE | 0.38 | 1.05 | 0.13 | 0.35 | 0.19 | 0.80 | 0.33 | 0.00 | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA |
| GER | 0.53 | 1.03 | 0.31 | 0.53 | 0.13 | 0.90 | 0.49 | 0.22 | 0.00 | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA |
| GRE | 0.29 | 0.74 | 0.32 | 0.33 | 0.26 | 0.57 | 0.31 | 0.34 | 0.37 | 0.00 | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA |
| GUJ | 0.65 | 0.37 | 0.88 | 0.72 | 0.85 | 0.32 | 0.72 | 0.93 | 0.96 | 0.62 | 0.00 | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA |
| HIN | 0.94 | 0.41 | 1.12 | 1.01 | 1.04 | 0.61 | 1.00 | 1.16 | 1.13 | 0.82 | 0.33 | 0.00 | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA |
| IBO | 0.45 | 0.55 | 0.67 | 0.52 | 0.67 | 0.20 | 0.53 | 0.74 | 0.78 | 0.44 | 0.25 | 0.53 | 0.00 | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA |
| IND | 0.13 | 0.89 | 0.19 | 0.12 | 0.34 | 0.57 | 0.10 | 0.29 | 0.44 | 0.27 | 0.73 | 1.01 | 0.53 | 0.00 | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA |
| ITA | 0.49 | 1.09 | 0.24 | 0.48 | 0.19 | 0.92 | 0.44 | 0.13 | 0.16 | 0.40 | 1.00 | 1.21 | 0.82 | 0.38 | 0.00 | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA |
| JPN | 0.23 | 0.95 | 0.13 | 0.21 | 0.28 | 0.70 | 0.18 | 0.21 | 0.36 | 0.29 | 0.81 | 1.04 | 0.62 | 0.14 | 0.29 | 0.00 | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA |
| KAN | 1.03 | 0.28 | 1.13 | 1.08 | 1.00 | 0.86 | 1.08 | 1.15 | 1.10 | 0.87 | 0.63 | 0.57 | 0.78 | 1.03 | 1.16 | 1.10 | 0.00 | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA |
| KOR | 0.11 | 0.85 | 0.24 | 0.09 | 0.37 | 0.54 | 0.08 | 0.33 | 0.47 | 0.28 | 0.68 | 0.94 | 0.49 | 0.07 | 0.42 | 0.16 | 1.02 | 0.00 | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA |
| MAL | 0.79 | 0.21 | 0.96 | 0.84 | 0.90 | 0.52 | 0.84 | 1.01 | 1.01 | 0.69 | 0.28 | 0.31 | 0.46 | 0.82 | 1.05 | 0.90 | 0.40 | 0.81 | 0.00 | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA |
| MAR | 1.08 | 0.26 | 1.17 | 1.13 | 1.04 | 0.87 | 1.12 | 1.19 | 1.12 | 0.91 | 0.59 | 0.47 | 0.78 | 1.08 | 1.19 | 1.12 | 0.19 | 1.07 | 0.34 | 0.00 | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA |
| NEP | 0.80 | 0.17 | 0.88 | 0.85 | 0.73 | 0.70 | 0.84 | 0.88 | 0.80 | 0.61 | 0.47 | 0.55 | 0.61 | 0.78 | 0.88 | 0.83 | 0.33 | 0.78 | 0.29 | 0.36 | 0.00 | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA |
| PAN | 0.76 | 0.55 | 0.82 | 0.82 | 0.63 | 0.76 | 0.79 | 0.79 | 0.66 | 0.54 | 0.56 | 0.62 | 0.59 | 0.76 | 0.75 | 0.74 | 0.55 | 0.74 | 0.59 | 0.64 | 0.41 | 0.00 | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA |
| POL | 0.21 | 0.90 | 0.09 | 0.21 | 0.16 | 0.60 | 0.18 | 0.11 | 0.23 | 0.25 | 0.77 | 1.04 | 0.57 | 0.15 | 0.20 | 0.10 | 1.02 | 0.17 | 0.90 | 1.13 | 0.78 | 0.71 | 0.00 | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA |
| POR | 0.16 | 0.89 | 0.08 | 0.17 | 0.22 | 0.58 | 0.14 | 0.15 | 0.31 | 0.29 | 0.75 | 1.04 | 0.54 | 0.13 | 0.27 | 0.11 | 1.03 | 0.14 | 0.90 | 1.13 | 0.79 | 0.74 | 0.10 | 0.00 | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA |
| RUM | 0.09 | 0.85 | 0.15 | 0.11 | 0.29 | 0.54 | 0.10 | 0.23 | 0.40 | 0.29 | 0.71 | 1.01 | 0.50 | 0.11 | 0.36 | 0.16 | 1.00 | 0.11 | 0.87 | 1.09 | 0.76 | 0.75 | 0.17 | 0.09 | 0.00 | NA | NA | NA | NA | NA | NA | NA | NA | NA |
| RUS | 0.23 | 0.93 | 0.03 | 0.22 | 0.23 | 0.70 | 0.19 | 0.10 | 0.31 | 0.32 | 0.80 | 1.06 | 0.63 | 0.16 | 0.24 | 0.06 | 1.04 | 0.18 | 0.95 | 1.12 | 0.78 | 0.71 | 0.09 | 0.09 | 0.17 | 0.00 | NA | NA | NA | NA | NA | NA | NA | NA |
| SPA | 0.23 | 0.86 | 0.03 | 0.23 | 0.23 | 0.72 | 0.19 | 0.11 | 0.32 | 0.29 | 0.79 | 1.03 | 0.64 | 0.15 | 0.26 | 0.05 | 0.96 | 0.16 | 0.91 | 1.05 | 0.68 | 0.67 | 0.09 | 0.09 | 0.17 | 0.03 | 0.00 | NA | NA | NA | NA | NA | NA | NA |
| TAM | 0.84 | 0.21 | 0.95 | 0.92 | 0.82 | 0.58 | 0.91 | 0.98 | 0.90 | 0.68 | 0.33 | 0.05 | 0.47 | 0.90 | 0.98 | 0.96 | 0.06 | 0.90 | 0.19 | 0.07 | 0.32 | 0.62 | 0.90 | 0.88 | 0.87 | 0.95 | 0.95 | 0.00 | NA | NA | NA | NA | NA | NA |
| TEL | 0.67 | 0.04 | 0.75 | 0.76 | 0.59 | 0.39 | 0.75 | 0.76 | 0.65 | 0.48 | 0.13 | 0.23 | 0.27 | 0.73 | 0.77 | 0.77 | 0.26 | 0.73 | 0.08 | 0.29 | 0.22 | 0.59 | 0.71 | 0.69 | 0.68 | 0.75 | 0.75 | 0.24 | 0.00 | NA | NA | NA | NA | NA |
| TGL | 0.22 | 0.57 | 0.37 | 0.30 | 0.41 | 0.22 | 0.32 | 0.43 | 0.52 | 0.34 | 0.42 | 0.80 | 0.30 | 0.32 | 0.53 | 0.39 | 0.81 | 0.31 | 0.62 | 0.86 | 0.49 | 0.70 | 0.39 | 0.31 | 0.25 | 0.37 | 0.38 | 0.80 | 0.56 | 0.00 | NA | NA | NA | NA |
| THA | 0.22 | 0.77 | 0.07 | 0.20 | 0.18 | 0.64 | 0.16 | 0.06 | 0.22 | 0.26 | 0.70 | 0.97 | 0.63 | 0.13 | 0.16 | 0.07 | 0.95 | 0.14 | 0.82 | 1.00 | 0.60 | 0.60 | 0.03 | 0.12 | 0.20 | 0.06 | 0.08 | 0.94 | 0.79 | 0.45 | 0.00 | NA | NA | NA |
| TUR | 0.05 | 0.68 | 0.17 | 0.04 | 0.29 | 0.55 | 0.07 | 0.27 | 0.41 | 0.27 | 0.60 | 0.90 | 0.54 | 0.08 | 0.40 | 0.18 | 0.86 | 0.06 | 0.75 | 0.89 | 0.53 | 0.68 | 0.23 | 0.10 | 0.06 | 0.17 | 0.17 | 0.86 | 0.71 | 0.32 | 0.22 | 0.00 | NA | NA |
| URD | 0.66 | 0.17 | 0.78 | 0.74 | 0.60 | 0.26 | 0.76 | 0.78 | 0.75 | 0.52 | 0.13 | 0.38 | 0.15 | 0.75 | 0.84 | 0.78 | 0.41 | 0.73 | 0.14 | 0.42 | 0.34 | 0.75 | 0.79 | 0.69 | 0.64 | 0.77 | 0.79 | 0.37 | 0.13 | 0.45 | 0.80 | 0.70 | 0.00 | NA |
| VIE | 0.34 | 0.59 | 0.08 | 0.25 | 0.10 | 0.80 | 0.19 | 0.02 | 0.15 | 0.18 | 0.61 | 0.66 | 0.80 | 0.18 | 0.14 | 0.11 | 0.54 | 0.22 | 0.72 | 0.52 | 0.30 | 0.26 | 0.02 | 0.20 | 0.34 | 0.07 | 0.06 | 0.57 | 0.65 | 0.59 | 0.01 | 0.31 | 0.78 | 0 |
ggdendro::ggdendrogram(hclust(dist_matrix)) +
ggtitle("Proportion overlap of ellipses, averaging across prompts")
Mean overlap excluded here because computationally slow.