Here, we look at the difference in the distributions across languages, with three measures: mean, variance and coherence.
Note that the exact numbers are dependent on the tSNE dimensions (should probably average across models?).
Read in doctag indices, docvecs, and metadata
doctag_indices <- read_feather("../all_data/doctag_indices.feather")
docvecs <- read_feather("../all_data/docvecs.feather") %>%
as.data.frame() %>%
mutate(offset = 0:(n()-1)) %>%
select(offset, everything())
#write_tsv(docvecs, "../all_data/docvecs.txt")
metadata <- read_csv("../all_data/merged_metadata.csv")
metadata_clean <- metadata %>%
mutate_if(is.character, as.factor) %>%
mutate(essay_id = as.character(essay_id))
Language summaries
kable(count(metadata_clean, L1_code), caption = "L1 distribution")
L1_code | n |
---|---|
ARA | 1100 |
CHI | 1100 |
FRE | 1100 |
GER | 1100 |
HIN | 1100 |
ITA | 1100 |
JPN | 1100 |
KOR | 1100 |
SPA | 1100 |
TEL | 1100 |
TUR | 1100 |
Merge all data sources together
d <- doctag_indices %>%
left_join(metadata_clean) %>%
left_join(docvecs) %>%
nest(-1:-10, .key = "doc_vector") %>%
select(-doc_count, -offset, -test_center_country_code) %>%
mutate(prompt_id = fct_recode(prompt_id,
broad.academics = "VC079857",
young.enjoy = "VC139555",
young.help = "VC182427",
advertisements = "VC199494",
fewer.cars = "VC199744",
tour.travel = "VC212639",
students.facts = "VC247638",
success.risks = "VC251653"))
Get TSNE coordinates.
#mats <- d %>%
# select(doc_vector) %>%
# unnest() %>%
# as.matrix()
#tsne_out = Rtsne::Rtsne(mats)
#tsne_dims <- tsne_out$Y %>%
# as.data.frame() %>%
# rename(tsne_X = V1,
# tsne_Y = V2) %>%
# bind_cols(d %>% select(essay_id, score, age, gender, L1_code, prompt_id)) %>%
# select(everything(), tsne_X, tsne_Y)
# write_csv(tsne_dims, "../all_data/tsne_dims_cached.csv")
tsne_dims <- read_csv("../all_data/tsne_dims_cached.csv")
ggplot(tsne_dims, aes(x = tsne_X, y = tsne_Y, color = L1_code)) +
geom_point(size = .2) +
theme_minimal()
ggplot(tsne_dims, aes(x = tsne_X, y = tsne_Y, color = L1_code)) +
#geom_density2d(alpha=.5, aes(color = prompt_id)) +
facet_wrap(~L1_code)+
geom_point(size = .2) +
#geom_hex(bins = 20) +
theme_minimal()
tsne_dims %>%
filter(L1_code %in% c("GER", "HIN")) %>%
ggplot(aes(x = tsne_X, y = tsne_Y, color = L1_code)) +
geom_density2d(alpha=.5, aes(color = L1_code)) +
facet_wrap(~prompt_id)+
#geom_point(size = .2) +
#geom_hex(bins = 20) +
theme_minimal()
tsne_centroids_by_prompt <- tsne_dims %>%
nest(1:2, .key = "doc_vector") %>%
mutate(L1_prompt = paste0(L1_code, "_", prompt_id )) %>%
split(.$L1_prompt) %>%
map_df(get_group_centroid, "L1_prompt") %>%
separate(L1_prompt, c("L1_code", "prompt_id"), sep = "_")
tsne_prompt_mean_centroids <- tsne_centroids_by_prompt %>%
split(.$L1_code) %>%
map_df(~colMeans(.[,-1:-2])) %>%
t() %>%
as.data.frame() %>%
rownames_to_column("L1_code") %>%
rename(tsne_X = V1,
tsne_Y = V2)
ggplot(tsne_centroids_by_prompt, aes(x = tsne_X, y = tsne_Y)) +
geom_point(aes(color = L1_code)) +
facet_wrap( ~prompt_id, scales = "free") +
theme_minimal()
ggplot(tsne_prompt_mean_centroids) +
geom_point(aes(x = tsne_X, y = tsne_Y), size = 2, color = 'black') +
ggrepel::geom_label_repel(
aes(tsne_X, y = tsne_Y, fill = L1_code, label = L1_code),
fontface = 'bold', color = 'black',
box.padding = unit(0.35, "lines"),
point.padding = unit(0.2, "lines"),
segment.color = 'black') +
ggtitle("Centroids in 2D, averaging across prompts") +
theme_minimal() +
theme(legend.position = "none")
Pairwise distances between languages
tsne_prompt_mean_centroids_mat <- as.matrix(tsne_prompt_mean_centroids[,-1])
rownames(tsne_prompt_mean_centroids_mat) = tsne_prompt_mean_centroids[,1]
dist_matrix <- dist(tsne_prompt_mean_centroids_mat)
dist_matrix_kable <- as.matrix(dist_matrix)
dist_matrix_kable[upper.tri(dist_matrix_kable)] <- NA
kable(as.data.frame(dist_matrix_kable), digits = 2)
ARA | CHI | FRE | GER | HIN | ITA | JPN | KOR | SPA | TEL | TUR | |
---|---|---|---|---|---|---|---|---|---|---|---|
ARA | 0.00 | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA |
CHI | 0.12 | 0.00 | NA | NA | NA | NA | NA | NA | NA | NA | NA |
FRE | 0.47 | 0.58 | 0.00 | NA | NA | NA | NA | NA | NA | NA | NA |
GER | 0.56 | 0.66 | 0.10 | 0.00 | NA | NA | NA | NA | NA | NA | NA |
HIN | 0.52 | 0.62 | 0.48 | 0.56 | 0.00 | NA | NA | NA | NA | NA | NA |
ITA | 0.66 | 0.74 | 0.29 | 0.21 | 0.76 | 0.00 | NA | NA | NA | NA | NA |
JPN | 0.41 | 0.52 | 0.38 | 0.47 | 0.12 | 0.67 | 0.00 | NA | NA | NA | NA |
KOR | 0.27 | 0.34 | 0.54 | 0.64 | 0.32 | 0.80 | 0.26 | 0.00 | NA | NA | NA |
SPA | 0.31 | 0.32 | 0.47 | 0.51 | 0.75 | 0.51 | 0.63 | 0.57 | 0.00 | NA | NA |
TEL | 0.27 | 0.39 | 0.25 | 0.35 | 0.32 | 0.52 | 0.20 | 0.29 | 0.43 | 0.00 | NA |
TUR | 0.11 | 0.14 | 0.46 | 0.53 | 0.60 | 0.60 | 0.48 | 0.38 | 0.20 | 0.31 | 0 |
ggdendro::ggdendrogram(hclust(dist_matrix)) +
ggtitle("2D Pairwise Centroid Distance, averaging across prompts")
Pairwise distances between languages
centroids_prompt <- d %>%
mutate(L1_prompt = paste0(L1_code, "_", prompt_id )) %>%
split(.$L1_prompt) %>%
map_df(get_group_centroid, "L1_prompt") %>%
separate(L1_prompt, c("L1_code", "prompt_id"), sep = "_")
prompt_mean_centroids <- centroids_prompt %>%
split(.$L1_code) %>%
map_df(~colMeans(.[,-1:-2])) %>%
t() %>%
as.data.frame() %>%
rownames_to_column("L1_code")
prompt_mean_centroids_mat = as.matrix(prompt_mean_centroids[,-1])
rownames(prompt_mean_centroids_mat) = prompt_mean_centroids[,1]
dist_matrix <- dist(prompt_mean_centroids_mat)
dist_matrix_kable <- as.matrix(dist_matrix)
dist_matrix_kable[upper.tri(dist_matrix_kable)] <- NA
kable(as.data.frame(dist_matrix_kable), digits = 2)
ARA | CHI | FRE | GER | HIN | ITA | JPN | KOR | SPA | TEL | TUR | |
---|---|---|---|---|---|---|---|---|---|---|---|
ARA | 0.00 | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA |
CHI | 1.44 | 0.00 | NA | NA | NA | NA | NA | NA | NA | NA | NA |
FRE | 1.61 | 1.45 | 0.00 | NA | NA | NA | NA | NA | NA | NA | NA |
GER | 1.87 | 1.76 | 1.17 | 0.00 | NA | NA | NA | NA | NA | NA | NA |
HIN | 2.02 | 2.20 | 2.14 | 2.03 | 0.00 | NA | NA | NA | NA | NA | NA |
ITA | 1.71 | 1.70 | 1.08 | 1.44 | 2.44 | 0.00 | NA | NA | NA | NA | NA |
JPN | 1.62 | 1.10 | 1.65 | 1.99 | 2.59 | 1.70 | 0.00 | NA | NA | NA | NA |
KOR | 1.65 | 1.04 | 1.68 | 2.01 | 2.44 | 1.80 | 0.73 | 0.00 | NA | NA | NA |
SPA | 1.27 | 1.44 | 0.96 | 1.31 | 2.08 | 1.08 | 1.61 | 1.66 | 0.00 | NA | NA |
TEL | 1.84 | 2.07 | 2.32 | 2.30 | 0.94 | 2.53 | 2.43 | 2.29 | 2.18 | 0.00 | NA |
TUR | 1.19 | 1.19 | 1.22 | 1.50 | 1.86 | 1.47 | 1.40 | 1.36 | 1.12 | 1.84 | 0 |
ggdendro::ggdendrogram(hclust(dist_matrix)) +
ggtitle("High-Dimensional Pairwise Centroid Distance, averaging across prompts")
Diameter is average pairwise distance between within-language essays in 2D-space.
tsne_diameters_prompt <- tsne_dims %>%
nest(1:2, .key = "doc_vector") %>%
mutate(L1_prompt = paste0(L1_code, "_", prompt_id )) %>%
split(.$L1_prompt) %>%
map_df(get_group_diameter, "L1_prompt", "mean_dist") %>%
separate(L1_prompt, c("L1_code", "prompt_id"), sep = "_")
tsne_prompt_mean_diameter <- tsne_diameters_prompt %>%
group_by(L1_code) %>%
multi_boot_standard(col = "diameter")
ggplot(tsne_prompt_mean_diameter, aes(x = reorder(L1_code, mean),
y = mean,
fill = L1_code)) +
geom_bar(stat = "identity") +
geom_linerange(aes(ymin = ci_lower, ymax = ci_upper)) +
xlab("L1") +
ggtitle("Diameter, averaging across prompts") +
theme_minimal() +
theme(legend.position = "none")
all_lang_combos <- expand.grid(group_1 = as.character(unique(tsne_dims$L1_code)),
group_2 = as.character(unique(tsne_dims$L1_code)))
all_prompts <- rep(unique(tsne_dims$prompt_id), each = nrow(all_lang_combos))
all_lang_combos2 <- do.call("rbind", replicate(length(unique(tsne_dims$prompt_id)),
all_lang_combos, simplify = FALSE)) %>%
bind_cols(prompt = all_prompts) %>%
mutate_all(funs(as.character)) %>%
filter(group_1 != group_2)
pairwise_tsne_overlap_prompt <- pmap_df(all_lang_combos2,
get_2d_overlap_prompt,
tsne_dims)
counrtywise_overlap <- pairwise_tsne_overlap_prompt %>%
group_by(group_1, prompt) %>%
summarize(p_overlap = mean(p_overlap)) %>%
group_by(group_1) %>%
multi_boot_standard(col = "p_overlap") %>%
rename(L1_code = group_1) # symmetrical for 1 and 2
ggplot(counrtywise_overlap, aes(x = reorder(L1_code, mean),
y = mean,
fill = L1_code)) +
geom_bar(stat = "identity") +
geom_linerange(aes(ymin = ci_lower, ymax = ci_upper)) +
xlab("L1") +
ggtitle("Proportion Overlap, averaging across prompts and pairs") +
theme_minimal() +
theme(legend.position = "none")
Pairwise country dendogram of proportion overlap
pairwise_tsne_overlap_prompt_mat <- pairwise_tsne_overlap_prompt %>%
group_by(group_1, group_2) %>%
summarize(p_overlap = mean(p_overlap)) %>%
mutate(dist_p_overlap = 1 - p_overlap) %>%
select(group_1, group_2, dist_p_overlap) %>%
spread(group_2, dist_p_overlap) %>%
remove_rownames() %>%
column_to_rownames("group_1") %>%
as.matrix()
pairwise_tsne_overlap_prompt_mat <- as.matrix(pairwise_tsne_overlap_prompt_mat)
dist_matrix <- dist(pairwise_tsne_overlap_prompt_mat)
dist_matrix_kable <- as.matrix(dist_matrix)
dist_matrix_kable[upper.tri(dist_matrix_kable)] <- NA
kable(as.data.frame(dist_matrix_kable), digits = 2)
ARA | CHI | FRE | GER | HIN | ITA | JPN | KOR | SPA | TEL | TUR | |
---|---|---|---|---|---|---|---|---|---|---|---|
ARA | 0.00 | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA |
CHI | 0.31 | 0.00 | NA | NA | NA | NA | NA | NA | NA | NA | NA |
FRE | 0.34 | 0.47 | 0.00 | NA | NA | NA | NA | NA | NA | NA | NA |
GER | 0.35 | 0.52 | 0.18 | 0.00 | NA | NA | NA | NA | NA | NA | NA |
HIN | 1.16 | 1.32 | 1.43 | 1.38 | 0.00 | NA | NA | NA | NA | NA | NA |
ITA | 0.37 | 0.44 | 0.21 | 0.33 | 1.52 | 0.00 | NA | NA | NA | NA | NA |
JPN | 0.37 | 0.22 | 0.59 | 0.62 | 1.21 | 0.61 | 0.00 | NA | NA | NA | NA |
KOR | 0.36 | 0.24 | 0.60 | 0.63 | 1.18 | 0.59 | 0.09 | 0.00 | NA | NA | NA |
SPA | 0.35 | 0.46 | 0.09 | 0.25 | 1.47 | 0.14 | 0.61 | 0.61 | 0.00 | NA | NA |
TEL | 1.10 | 1.24 | 1.34 | 1.31 | 0.08 | 1.44 | 1.13 | 1.11 | 1.37 | 0.00 | NA |
TUR | 0.27 | 0.34 | 0.24 | 0.26 | 1.31 | 0.29 | 0.46 | 0.48 | 0.25 | 1.25 | 0 |
ggdendro::ggdendrogram(hclust(dist_matrix)) +
ggtitle("Proportion overlap of ellipses, averaging across prompts")
For each prompt, for each pair of languages, I take ratio of within-group to between group distances. So, take the mean of the pairwise distances amongst all the essays in L1 and amongst all the essays in L2, and the mean pairwise distances between each essay in L1 and in L2 (Between), then compute Between / ((L1 + L2)/2). This number is 1 if there is no clustering, but gets larger with better clustering.
pairwise_tsne_cohesion <- pmap_df(all_lang_combos2,
get_2d_coherence_prompt,
tsne_dims)
countrywise_cohesions <- pairwise_tsne_cohesion %>%
group_by(L1_code_1, prompt_id) %>%
summarize(cohesion = mean(cohesion)) %>%
group_by(L1_code_1) %>%
multi_boot_standard(col = "cohesion") %>%
rename(L1_code = L1_code_1) # symmetrical for 1 and 2
ggplot(countrywise_cohesions, aes(x = reorder(L1_code, mean),
y = mean,
fill = L1_code)) +
geom_bar(stat = "identity") +
geom_linerange(aes(ymin = ci_lower, ymax = ci_upper)) +
xlab("L1") +
ggtitle("Cohesion, averaging across prompts and pairs") +
theme_minimal() +
theme(legend.position = "none")
Pairwise country dendogram of cohesion
pairwise_tsne_cohesion_mat <- pairwise_tsne_cohesion %>%
group_by(L1_code_1, L1_code_2) %>%
summarize(cohesion = mean(cohesion)) %>%
select(L1_code_1, L1_code_2, cohesion) %>%
spread(L1_code_2, cohesion) %>%
remove_rownames() %>%
column_to_rownames("L1_code_1") %>%
as.matrix()
dist_matrix <- dist(pairwise_tsne_cohesion_mat)
dist_matrix_kable <- as.matrix(dist_matrix)
dist_matrix_kable[upper.tri(dist_matrix_kable)] <- NA
kable(as.data.frame(dist_matrix_kable), digits = 2)
ARA | CHI | FRE | GER | HIN | ITA | JPN | KOR | SPA | TEL | TUR | |
---|---|---|---|---|---|---|---|---|---|---|---|
ARA | 0.00 | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA |
CHI | 0.08 | 0.00 | NA | NA | NA | NA | NA | NA | NA | NA | NA |
FRE | 0.12 | 0.15 | 0.00 | NA | NA | NA | NA | NA | NA | NA | NA |
GER | 0.24 | 0.28 | 0.21 | 0.00 | NA | NA | NA | NA | NA | NA | NA |
HIN | 1.06 | 1.07 | 1.07 | 0.92 | 0.00 | NA | NA | NA | NA | NA | NA |
ITA | 0.33 | 0.30 | 0.31 | 0.38 | 1.00 | 0.00 | NA | NA | NA | NA | NA |
JPN | 0.33 | 0.28 | 0.37 | 0.40 | 0.99 | 0.19 | 0.00 | NA | NA | NA | NA |
KOR | 0.17 | 0.11 | 0.23 | 0.31 | 1.04 | 0.24 | 0.18 | 0.00 | NA | NA | NA |
SPA | 0.15 | 0.15 | 0.07 | 0.21 | 1.05 | 0.25 | 0.31 | 0.20 | 0.00 | NA | NA |
TEL | 0.97 | 0.97 | 0.97 | 0.79 | 0.14 | 0.93 | 0.93 | 0.94 | 0.94 | 0.00 | NA |
TUR | 0.18 | 0.22 | 0.18 | 0.23 | 1.05 | 0.47 | 0.49 | 0.32 | 0.24 | 0.94 | 0 |
ggdendro::ggdendrogram(hclust(dist_matrix)) +
ggtitle("Proportion overlap of ellipses, averaging across prompts")
Overall, the overlap and cohesion measures are very similiar.