Here, we look at the difference in the distributions across languages with mean and variance in original (200) dimensionality.
Read in doctag indices, docvecs, and metadata, and merge
doctag_indices <- read_feather("../../all_data/doctag_indices.feather")
docvecs <- read_feather("../../all_data/docvecs.feather") %>%
as.data.frame() %>%
mutate(offset = 0:(n()-1)) %>%
select(offset, everything())
#write_tsv(docvecs, "../all_data/docvecs.txt")
metadata <- read_csv("../../all_data/raw/merged_metadata.csv")
metadata_clean <- metadata %>%
mutate_if(is.character, as.factor) %>%
mutate(essay_id = as.character(essay_id))
#Merge all data sources together
d <- doctag_indices %>%
left_join(metadata_clean) %>%
left_join(docvecs) %>%
nest(-1:-10, .key = "doc_vector") %>%
select(-doc_count, -offset, -test_center_country_code) %>%
mutate(prompt_id = fct_recode(prompt_id,
broad.academics = "VC079857",
young.enjoy = "VC139555",
young.help = "VC182427",
advertisements = "VC199494",
fewer.cars = "VC199744",
tour.travel = "VC212639",
students.facts = "VC247638",
success.risks = "VC251653"))
Pairwise distances between languages
centroids_prompt <- d %>%
mutate(L1_prompt = paste0(L1_code, "_", prompt_id )) %>%
split(.$L1_prompt) %>%
map_df(get_group_centroid, "L1_prompt") %>%
separate(L1_prompt, c("L1_code", "prompt_id"), sep = "_")
prompt_mean_centroids <- centroids_prompt %>%
split(.$L1_code) %>%
map_df(~colMeans(.[,-1:-2])) %>%
t() %>%
as.data.frame() %>%
rownames_to_column("L1_code")
prompt_mean_centroids_mat = as.matrix(prompt_mean_centroids[,-1])
rownames(prompt_mean_centroids_mat) = prompt_mean_centroids[,1]
dist_matrix <- dist(prompt_mean_centroids_mat)
dist_matrix_kable <- as.matrix(dist_matrix)
dist_matrix_kable[upper.tri(dist_matrix_kable)] <- NA
kable(as.data.frame(dist_matrix_kable), digits = 2)
| ARA | CHI | FRE | GER | HIN | ITA | JPN | KOR | SPA | TEL | TUR | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| ARA | 0.00 | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA |
| CHI | 1.44 | 0.00 | NA | NA | NA | NA | NA | NA | NA | NA | NA |
| FRE | 1.61 | 1.45 | 0.00 | NA | NA | NA | NA | NA | NA | NA | NA |
| GER | 1.87 | 1.76 | 1.17 | 0.00 | NA | NA | NA | NA | NA | NA | NA |
| HIN | 2.02 | 2.20 | 2.14 | 2.03 | 0.00 | NA | NA | NA | NA | NA | NA |
| ITA | 1.71 | 1.70 | 1.08 | 1.44 | 2.44 | 0.00 | NA | NA | NA | NA | NA |
| JPN | 1.62 | 1.10 | 1.65 | 1.99 | 2.59 | 1.70 | 0.00 | NA | NA | NA | NA |
| KOR | 1.65 | 1.04 | 1.68 | 2.01 | 2.44 | 1.80 | 0.73 | 0.00 | NA | NA | NA |
| SPA | 1.27 | 1.44 | 0.96 | 1.31 | 2.08 | 1.08 | 1.61 | 1.66 | 0.00 | NA | NA |
| TEL | 1.84 | 2.07 | 2.32 | 2.30 | 0.94 | 2.53 | 2.43 | 2.29 | 2.18 | 0.00 | NA |
| TUR | 1.19 | 1.19 | 1.22 | 1.50 | 1.86 | 1.47 | 1.40 | 1.36 | 1.12 | 1.84 | 0 |
ggdendro::ggdendrogram(hclust(dist_matrix)) +
ggtitle("High-Dimensional Pairwise Centroid Distance, averaging across prompts")
write_csv(as.data.frame(dist_matrix_kable), "../../all_data/pairwise_country_distances/HD_centroid_distances.csv")
Diameter is average pairwise distance between within-language essays in 2D-space.
d_diameters_prompt <- d %>%
mutate(L1_prompt = paste0(L1_code, "_", prompt_id )) %>%
split(.$L1_prompt) %>%
map_df(get_group_diameter, "L1_prompt", "mean_dist") %>%
separate(L1_prompt, c("L1_code", "prompt_id"), sep = "_")
d_prompt_mean_diameter <- d_diameters_prompt %>%
group_by(L1_code) %>%
multi_boot_standard(col = "diameter")
ggplot(d_prompt_mean_diameter, aes(x = reorder(L1_code, mean),
y = mean,
fill = L1_code)) +
geom_bar(stat = "identity") +
geom_linerange(aes(ymin = ci_lower, ymax = ci_upper)) +
xlab("L1") +
ggtitle("Diameter, averaging across prompts") +
theme_minimal() +
theme(legend.position = "none")