Here, we look at the difference in the distributions across languages in terms of mean, variance, and cohesian.


Read in doctag indices, docvecs, and metadata

doctag_indices <- read_feather("../../../data/processed/models/all.model/doctag_indices_all_model.feather") 
docvecs <- read_feather("../../../data/processed/models/all.model/docvecs_all_model.feather") %>%
  as.data.frame() %>%
  mutate(offset = 0:(n()-1)) %>%
  select(offset, everything())

#write_tsv(docvecs, "../all_data/docvecs.txt")
metadata <- read_csv("../../../data/raw/merged_metadata.csv") 
metadata_clean <- metadata %>%
  mutate_if(is.character, as.factor)  %>%
  mutate(essay_id = as.character(essay_id))

Language summaries

kable(count(metadata_clean, L1_code), caption = "L1 distribution")
L1 distribution
L1_code n
ARA 1100
BEN 1100
BUL 1100
CHI 1100
DUT 1100
ENG 1100
FAS 1100
FRE 1100
GER 1100
GRE 1100
GUJ 1100
HIN 1100
IBO 1100
IND 1100
ITA 1100
JPN 1100
KAN 1100
KOR 1100
MAL 1100
MAR 1100
NEP 1100
PAN 1100
POL 1100
POR 1100
RUM 1100
RUS 1100
SPA 1100
TAM 1100
TEL 1100
TGL 1100
THA 1100
TUR 1100
URD 1100
VIE 1100
YOR 1100

Merge all data sources together

d <- doctag_indices %>%
  left_join(metadata_clean) %>%
  left_join(docvecs) %>%
  nest(-1:-10, .key = "doc_vector") %>%
  select(-doc_count, -offset, -test_center_country_code) 

Get TSNE coordinates.

#mats <- d %>%
#  select(doc_vector) %>%
#  unnest() %>%
#  as.matrix()

#tsne_out = Rtsne::Rtsne(mats)
#tsne_dims <- tsne_out$Y %>%
#  as.data.frame() %>%
#  rename(tsne_X = V1,
#         tsne_Y = V2) %>%
#  bind_cols(d %>% select(essay_id, score, age, gender, L1_code, prompt_id)) %>%
#  select(everything(), tsne_X, tsne_Y)
# write_csv(tsne_dims, "../../../data/processed/tsne/tsne_dims_cached_all.csv")
tsne_dims <- read_csv("../../../data/processed/tsne/tsne_dims_cached_all.csv")

1 Mean

1.1 Visualize distributions

1.1.1 All

ggplot(tsne_dims, aes(x = tsne_X, y = tsne_Y, color = L1_code)) +
  geom_point(size = .2) +
  theme_minimal() +
  ggtitle("All essays")

1.1.2 By-language

ggplot(tsne_dims, aes(x = tsne_X, y = tsne_Y, color = L1_code)) +
  #geom_density2d(alpha=.5, aes(color = prompt_id)) + 
  facet_wrap(~L1_code)+
  geom_point(size = .2) +
  #geom_hex(bins = 20) +
  theme_minimal()

1.2 Centroids

tsne_centroids_by_prompt <- tsne_dims %>%
  nest(1:2, .key = "doc_vector") %>%
  mutate(L1_prompt = paste0(L1_code, "_", prompt_id )) %>%
  split(.$L1_prompt) %>% 
  map_df(get_group_centroid, "L1_prompt") %>%
  separate(L1_prompt, c("L1_code", "prompt_id"), sep = "_") 

tsne_prompt_mean_centroids <- tsne_centroids_by_prompt %>%
  split(.$L1_code) %>% 
  map_df(~colMeans(.[,-1:-2])) %>%
  t() %>% 
  as.data.frame() %>%
  rownames_to_column("L1_code") %>%
  rename(tsne_X = V1,
         tsne_Y = V2)

ggplot(tsne_centroids_by_prompt, aes(x = tsne_X, y = tsne_Y)) +
 geom_point(aes(color = L1_code)) +
  facet_wrap( ~prompt_id, scales = "free") +
  theme_minimal() 

ggplot(tsne_prompt_mean_centroids) +
  geom_label(aes(x = tsne_X, y = tsne_Y, label = L1_code, fill = L1_code), size =4) +
#geom_point(aes(x = tsne_X, y = tsne_Y), size = 2, color = 'black') +
  #ggrepel::geom_label_repel(
  #  aes(tsne_X, y = tsne_Y, fill = L1_code, label = L1_code),
   # fontface = 'bold', color = 'black',
   # box.padding = unit(0.35, "lines"),
   # point.padding = unit(0.2, "lines"),
   # segment.color = 'black') +
  ggtitle("Centroids averaging across prompts") +
  theme_minimal() +
  theme(legend.position = "none",
        text = element_text(size = 15)) 

2 Variance

Diameter is average pairwise distance between within-language essays in 2D-space.

tsne_diameters_prompt <- tsne_dims %>%
  nest(1:2, .key = "doc_vector") %>%
  mutate(L1_prompt = paste0(L1_code, "_", prompt_id )) %>%
  split(.$L1_prompt) %>% 
  map_df(get_group_diameter, "L1_prompt", "mean_dist")  %>%
  separate(L1_prompt, c("L1_code", "prompt_id"), sep = "_")

tsne_prompt_mean_diameter <- tsne_diameters_prompt %>%
  group_by(L1_code) %>%
  multi_boot_standard(col = "diameter", na.rm = TRUE)

ggplot(tsne_prompt_mean_diameter, aes(x = reorder(L1_code, mean), 
                      y = mean, 
                      fill = L1_code)) +
  geom_bar(stat = "identity") +
  geom_linerange(aes(ymin = ci_lower, ymax = ci_upper)) +
  xlab("L1") +
  ggtitle("Diameter, averaging across prompts") +
  theme_minimal() +
  theme(legend.position = "none",
        axis.text.x = element_text(angle = 90, hjust = 1))

3 Coherence

3.1 Between/Within

For each prompt, for each pair of languages, I take ratio of within-group to between group distances. So, take the mean of the pairwise distances amongst all the essays in L1 and amongst all the essays in L2, and the mean pairwise distances between each essay in L1 and in L2 (Between), then compute Between / ((L1 + L2)/2). This number is 1 if there is no clustering, but gets larger with better clustering.

#pairwise_tsne_cohesion <- pmap_df(all_lang_combos2,
#                                 get_2d_coherence_prompt, 
#                                 tsne_dims) 

#write_feather(pairwise_tsne_cohesion, "../../../data/processed/pairwise_tsne_cohesion_all.feather")

pairwise_tsne_cohesion <- read_feather("../../../data/processed/pairwise_tsne_cohesion_all.feather")

countrywise_cohesions <- pairwise_tsne_cohesion %>%
  group_by(L1_code_1, prompt_id) %>%
  summarize(cohesion = mean(cohesion)) %>%
  group_by(L1_code_1) %>%
  multi_boot_standard(col = "cohesion", na.rm = TRUE) %>%
  rename(L1_code = L1_code_1) # symmetrical for 1 and 2

ggplot(countrywise_cohesions, aes(x = reorder(L1_code, mean), 
                      y = mean, 
                      fill = L1_code)) +
  geom_bar(stat = "identity") +
  geom_linerange(aes(ymin = ci_lower, ymax = ci_upper)) +
  xlab("L1") +
  ggtitle("Cohesion, averaging across prompts and pairs") +
  theme_minimal() +
  theme(legend.position = "none")  +
   theme(legend.position = "none",
        axis.text.x = element_text(angle = 90, hjust = 1))

Pairwise country dendogram of cohesion

pairwise_tsne_cohesion_mat <- pairwise_tsne_cohesion %>%
  group_by(L1_code_1, L1_code_2) %>%
  summarize(cohesion = mean(cohesion)) %>%
  select(L1_code_1, L1_code_2, cohesion) %>%
  spread(L1_code_2, cohesion) %>%
  remove_rownames() %>%
  column_to_rownames("L1_code_1") %>%
  as.matrix()

dist_matrix <- dist(pairwise_tsne_cohesion_mat)
dist_matrix_kable <- as.matrix(dist_matrix)
dist_matrix_kable[upper.tri(dist_matrix_kable)] <- NA
kable(as.data.frame(dist_matrix_kable), digits = 2)
ARA BEN BUL CHI DUT ENG FAS FRE GER GRE GUJ HIN IBO IND ITA JPN KAN KOR MAL MAR NEP PAN POL POR RUM RUS SPA TAM TEL TGL THA TUR URD VIE
ARA 0.00 NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA
BEN 0.83 0.00 NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA
BUL 0.27 0.99 0.00 NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA
CHI 0.10 0.90 0.24 0.00 NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA
DUT 0.41 0.93 0.24 0.42 0.00 NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA
ENG 0.45 0.61 0.70 0.51 0.75 0.00 NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA
FAS 0.10 0.91 0.22 0.05 0.39 0.53 0.00 NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA
FRE 0.38 1.05 0.13 0.35 0.19 0.80 0.33 0.00 NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA
GER 0.53 1.03 0.31 0.53 0.13 0.90 0.49 0.22 0.00 NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA
GRE 0.29 0.74 0.32 0.33 0.26 0.57 0.31 0.34 0.37 0.00 NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA
GUJ 0.65 0.37 0.88 0.72 0.85 0.32 0.72 0.93 0.96 0.62 0.00 NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA
HIN 0.94 0.41 1.12 1.01 1.04 0.61 1.00 1.16 1.13 0.82 0.33 0.00 NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA
IBO 0.45 0.55 0.67 0.52 0.67 0.20 0.53 0.74 0.78 0.44 0.25 0.53 0.00 NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA
IND 0.13 0.89 0.19 0.12 0.34 0.57 0.10 0.29 0.44 0.27 0.73 1.01 0.53 0.00 NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA
ITA 0.49 1.09 0.24 0.48 0.19 0.92 0.44 0.13 0.16 0.40 1.00 1.21 0.82 0.38 0.00 NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA
JPN 0.23 0.95 0.13 0.21 0.28 0.70 0.18 0.21 0.36 0.29 0.81 1.04 0.62 0.14 0.29 0.00 NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA
KAN 1.03 0.28 1.13 1.08 1.00 0.86 1.08 1.15 1.10 0.87 0.63 0.57 0.78 1.03 1.16 1.10 0.00 NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA
KOR 0.11 0.85 0.24 0.09 0.37 0.54 0.08 0.33 0.47 0.28 0.68 0.94 0.49 0.07 0.42 0.16 1.02 0.00 NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA
MAL 0.79 0.21 0.96 0.84 0.90 0.52 0.84 1.01 1.01 0.69 0.28 0.31 0.46 0.82 1.05 0.90 0.40 0.81 0.00 NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA
MAR 1.08 0.26 1.17 1.13 1.04 0.87 1.12 1.19 1.12 0.91 0.59 0.47 0.78 1.08 1.19 1.12 0.19 1.07 0.34 0.00 NA NA NA NA NA NA NA NA NA NA NA NA NA NA
NEP 0.80 0.17 0.88 0.85 0.73 0.70 0.84 0.88 0.80 0.61 0.47 0.55 0.61 0.78 0.88 0.83 0.33 0.78 0.29 0.36 0.00 NA NA NA NA NA NA NA NA NA NA NA NA NA
PAN 0.76 0.55 0.82 0.82 0.63 0.76 0.79 0.79 0.66 0.54 0.56 0.62 0.59 0.76 0.75 0.74 0.55 0.74 0.59 0.64 0.41 0.00 NA NA NA NA NA NA NA NA NA NA NA NA
POL 0.21 0.90 0.09 0.21 0.16 0.60 0.18 0.11 0.23 0.25 0.77 1.04 0.57 0.15 0.20 0.10 1.02 0.17 0.90 1.13 0.78 0.71 0.00 NA NA NA NA NA NA NA NA NA NA NA
POR 0.16 0.89 0.08 0.17 0.22 0.58 0.14 0.15 0.31 0.29 0.75 1.04 0.54 0.13 0.27 0.11 1.03 0.14 0.90 1.13 0.79 0.74 0.10 0.00 NA NA NA NA NA NA NA NA NA NA
RUM 0.09 0.85 0.15 0.11 0.29 0.54 0.10 0.23 0.40 0.29 0.71 1.01 0.50 0.11 0.36 0.16 1.00 0.11 0.87 1.09 0.76 0.75 0.17 0.09 0.00 NA NA NA NA NA NA NA NA NA
RUS 0.23 0.93 0.03 0.22 0.23 0.70 0.19 0.10 0.31 0.32 0.80 1.06 0.63 0.16 0.24 0.06 1.04 0.18 0.95 1.12 0.78 0.71 0.09 0.09 0.17 0.00 NA NA NA NA NA NA NA NA
SPA 0.23 0.86 0.03 0.23 0.23 0.72 0.19 0.11 0.32 0.29 0.79 1.03 0.64 0.15 0.26 0.05 0.96 0.16 0.91 1.05 0.68 0.67 0.09 0.09 0.17 0.03 0.00 NA NA NA NA NA NA NA
TAM 0.84 0.21 0.95 0.92 0.82 0.58 0.91 0.98 0.90 0.68 0.33 0.05 0.47 0.90 0.98 0.96 0.06 0.90 0.19 0.07 0.32 0.62 0.90 0.88 0.87 0.95 0.95 0.00 NA NA NA NA NA NA
TEL 0.67 0.04 0.75 0.76 0.59 0.39 0.75 0.76 0.65 0.48 0.13 0.23 0.27 0.73 0.77 0.77 0.26 0.73 0.08 0.29 0.22 0.59 0.71 0.69 0.68 0.75 0.75 0.24 0.00 NA NA NA NA NA
TGL 0.22 0.57 0.37 0.30 0.41 0.22 0.32 0.43 0.52 0.34 0.42 0.80 0.30 0.32 0.53 0.39 0.81 0.31 0.62 0.86 0.49 0.70 0.39 0.31 0.25 0.37 0.38 0.80 0.56 0.00 NA NA NA NA
THA 0.22 0.77 0.07 0.20 0.18 0.64 0.16 0.06 0.22 0.26 0.70 0.97 0.63 0.13 0.16 0.07 0.95 0.14 0.82 1.00 0.60 0.60 0.03 0.12 0.20 0.06 0.08 0.94 0.79 0.45 0.00 NA NA NA
TUR 0.05 0.68 0.17 0.04 0.29 0.55 0.07 0.27 0.41 0.27 0.60 0.90 0.54 0.08 0.40 0.18 0.86 0.06 0.75 0.89 0.53 0.68 0.23 0.10 0.06 0.17 0.17 0.86 0.71 0.32 0.22 0.00 NA NA
URD 0.66 0.17 0.78 0.74 0.60 0.26 0.76 0.78 0.75 0.52 0.13 0.38 0.15 0.75 0.84 0.78 0.41 0.73 0.14 0.42 0.34 0.75 0.79 0.69 0.64 0.77 0.79 0.37 0.13 0.45 0.80 0.70 0.00 NA
VIE 0.34 0.59 0.08 0.25 0.10 0.80 0.19 0.02 0.15 0.18 0.61 0.66 0.80 0.18 0.14 0.11 0.54 0.22 0.72 0.52 0.30 0.26 0.02 0.20 0.34 0.07 0.06 0.57 0.65 0.59 0.01 0.31 0.78 0
ggdendro::ggdendrogram(hclust(dist_matrix)) +
  ggtitle("Proportion overlap of ellipses, averaging across prompts")

Mean overlap excluded here because computationally slow.