Here, we look at the difference in the distributions across languages with mean and variance in original (200) dimensionality.


Read in doctag indices, docvecs, and metadata, and merge

doctag_indices <- read_feather("../../all_data/doctag_indices.feather") 
docvecs <- read_feather("../../all_data/docvecs.feather") %>%
  as.data.frame() %>%
  mutate(offset = 0:(n()-1)) %>%
  select(offset, everything())

#write_tsv(docvecs, "../all_data/docvecs.txt")
metadata <- read_csv("../../all_data/raw/merged_metadata.csv") 
metadata_clean <- metadata %>%
  mutate_if(is.character, as.factor)  %>%
  mutate(essay_id = as.character(essay_id))

#Merge all data sources together
d <- doctag_indices %>%
  left_join(metadata_clean) %>%
  left_join(docvecs) %>%
  nest(-1:-10, .key = "doc_vector") %>%
  select(-doc_count, -offset, -test_center_country_code) %>%
  mutate(prompt_id = fct_recode(prompt_id, 
                                broad.academics = "VC079857", 
                                young.enjoy = "VC139555", 
                                young.help = "VC182427",
                                advertisements = "VC199494",
                                fewer.cars = "VC199744",
                                tour.travel = "VC212639",
                                students.facts = "VC247638",
                                success.risks = "VC251653"))

0.1 Centroids

0.1.1 Distances

Pairwise distances between languages

centroids_prompt <- d %>%
  mutate(L1_prompt = paste0(L1_code, "_", prompt_id )) %>%
  split(.$L1_prompt) %>% 
  map_df(get_group_centroid, "L1_prompt") %>%
  separate(L1_prompt, c("L1_code", "prompt_id"), sep = "_")

prompt_mean_centroids <- centroids_prompt %>%
  split(.$L1_code) %>% 
  map_df(~colMeans(.[,-1:-2])) %>%
  t() %>% 
  as.data.frame() %>%
  rownames_to_column("L1_code")

prompt_mean_centroids_mat = as.matrix(prompt_mean_centroids[,-1])
rownames(prompt_mean_centroids_mat) = prompt_mean_centroids[,1]

dist_matrix <- dist(prompt_mean_centroids_mat)
dist_matrix_kable <- as.matrix(dist_matrix)
dist_matrix_kable[upper.tri(dist_matrix_kable)] <- NA
kable(as.data.frame(dist_matrix_kable), digits = 2)
ARA CHI FRE GER HIN ITA JPN KOR SPA TEL TUR
ARA 0.00 NA NA NA NA NA NA NA NA NA NA
CHI 1.44 0.00 NA NA NA NA NA NA NA NA NA
FRE 1.61 1.45 0.00 NA NA NA NA NA NA NA NA
GER 1.87 1.76 1.17 0.00 NA NA NA NA NA NA NA
HIN 2.02 2.20 2.14 2.03 0.00 NA NA NA NA NA NA
ITA 1.71 1.70 1.08 1.44 2.44 0.00 NA NA NA NA NA
JPN 1.62 1.10 1.65 1.99 2.59 1.70 0.00 NA NA NA NA
KOR 1.65 1.04 1.68 2.01 2.44 1.80 0.73 0.00 NA NA NA
SPA 1.27 1.44 0.96 1.31 2.08 1.08 1.61 1.66 0.00 NA NA
TEL 1.84 2.07 2.32 2.30 0.94 2.53 2.43 2.29 2.18 0.00 NA
TUR 1.19 1.19 1.22 1.50 1.86 1.47 1.40 1.36 1.12 1.84 0
ggdendro::ggdendrogram(hclust(dist_matrix)) +
  ggtitle("High-Dimensional Pairwise Centroid Distance, averaging across prompts")

write_csv(as.data.frame(dist_matrix_kable), "../../all_data/pairwise_country_distances/HD_centroid_distances.csv")

1 Variance

Diameter is average pairwise distance between within-language essays in 2D-space.

d_diameters_prompt <- d %>%
  mutate(L1_prompt = paste0(L1_code, "_", prompt_id )) %>%
  split(.$L1_prompt) %>% 
  map_df(get_group_diameter, "L1_prompt", "mean_dist")  %>%
  separate(L1_prompt, c("L1_code", "prompt_id"), sep = "_")

d_prompt_mean_diameter <- d_diameters_prompt %>%
  group_by(L1_code) %>%
  multi_boot_standard(col = "diameter")

ggplot(d_prompt_mean_diameter, aes(x = reorder(L1_code, mean), 
                      y = mean, 
                      fill = L1_code)) +
  geom_bar(stat = "identity") +
  geom_linerange(aes(ymin = ci_lower, ymax = ci_upper)) +
  xlab("L1") +
  ggtitle("Diameter, averaging across prompts") +
  theme_minimal() +
  theme(legend.position = "none")