Here, we look at the difference in the distributions across languages, with three measures: mean, variance and coherence.

Note that the exact numbers are dependent on the tSNE dimensions (should probably average across models?).


Read in doctag indices, docvecs, and metadata

doctag_indices <- read_feather("../all_data/doctag_indices.feather") 
docvecs <- read_feather("../all_data/docvecs.feather") %>%
  as.data.frame() %>%
  mutate(offset = 0:(n()-1)) %>%
  select(offset, everything())

#write_tsv(docvecs, "../all_data/docvecs.txt")
metadata <- read_csv("../all_data/merged_metadata.csv") 
metadata_clean <- metadata %>%
  mutate_if(is.character, as.factor)  %>%
  mutate(essay_id = as.character(essay_id))

Language summaries

kable(count(metadata_clean, L1_code), caption = "L1 distribution")
L1 distribution
L1_code n
ARA 1100
CHI 1100
FRE 1100
GER 1100
HIN 1100
ITA 1100
JPN 1100
KOR 1100
SPA 1100
TEL 1100
TUR 1100

Merge all data sources together

d <- doctag_indices %>%
  left_join(metadata_clean) %>%
  left_join(docvecs) %>%
  nest(-1:-10, .key = "doc_vector") %>%
  select(-doc_count, -offset, -test_center_country_code) %>%
  mutate(prompt_id = fct_recode(prompt_id, 
                                broad.academics = "VC079857", 
                                young.enjoy = "VC139555", 
                                young.help = "VC182427",
                                advertisements = "VC199494",
                                fewer.cars = "VC199744",
                                tour.travel = "VC212639",
                                students.facts = "VC247638",
                                success.risks = "VC251653"))

Get TSNE coordinates.

#mats <- d %>%
#  select(doc_vector) %>%
#  unnest() %>%
#  as.matrix()

#tsne_out = Rtsne::Rtsne(mats)
#tsne_dims <- tsne_out$Y %>%
#  as.data.frame() %>%
#  rename(tsne_X = V1,
#         tsne_Y = V2) %>%
#  bind_cols(d %>% select(essay_id, score, age, gender, L1_code, prompt_id)) %>%
#  select(everything(), tsne_X, tsne_Y)
# write_csv(tsne_dims, "../all_data/tsne_dims_cached.csv")
tsne_dims <- read_csv("../all_data/tsne_dims_cached.csv")

1 Mean

1.1 Visualize distributions

1.1.1 All

ggplot(tsne_dims, aes(x = tsne_X, y = tsne_Y, color = L1_code)) +
  geom_point(size = .2) +
  theme_minimal()

1.1.2 By-language

ggplot(tsne_dims, aes(x = tsne_X, y = tsne_Y, color = L1_code)) +
  #geom_density2d(alpha=.5, aes(color = prompt_id)) + 
  facet_wrap(~L1_code)+
  geom_point(size = .2) +
  #geom_hex(bins = 20) +
  theme_minimal()

tsne_dims %>%
  filter(L1_code %in% c("GER", "HIN")) %>%
  ggplot(aes(x = tsne_X, y = tsne_Y, color = L1_code)) +
  geom_density2d(alpha=.5, aes(color = L1_code)) + 
  facet_wrap(~prompt_id)+
  #geom_point(size = .2) +
  #geom_hex(bins = 20) +
  theme_minimal()

1.2 Centroids

1.2.1 Distances in 2-D

tsne_centroids_by_prompt <- tsne_dims %>%
  nest(1:2, .key = "doc_vector") %>%
  mutate(L1_prompt = paste0(L1_code, "_", prompt_id )) %>%
  split(.$L1_prompt) %>% 
  map_df(get_group_centroid, "L1_prompt") %>%
  separate(L1_prompt, c("L1_code", "prompt_id"), sep = "_") 

tsne_prompt_mean_centroids <- tsne_centroids_by_prompt %>%
  split(.$L1_code) %>% 
  map_df(~colMeans(.[,-1:-2])) %>%
  t() %>% 
  as.data.frame() %>%
  rownames_to_column("L1_code") %>%
  rename(tsne_X = V1,
         tsne_Y = V2)

ggplot(tsne_centroids_by_prompt, aes(x = tsne_X, y = tsne_Y)) +
 geom_point(aes(color = L1_code)) +
  facet_wrap( ~prompt_id, scales = "free") +
  theme_minimal() 

ggplot(tsne_prompt_mean_centroids) +
geom_point(aes(x = tsne_X, y = tsne_Y), size = 2, color = 'black') +
  ggrepel::geom_label_repel(
    aes(tsne_X, y = tsne_Y, fill = L1_code, label = L1_code),
    fontface = 'bold', color = 'black',
    box.padding = unit(0.35, "lines"),
    point.padding = unit(0.2, "lines"),
    segment.color = 'black') +
  ggtitle("Centroids in 2D, averaging across prompts") +
  theme_minimal() +
  theme(legend.position = "none") 

Pairwise distances between languages

tsne_prompt_mean_centroids_mat <- as.matrix(tsne_prompt_mean_centroids[,-1])
rownames(tsne_prompt_mean_centroids_mat) = tsne_prompt_mean_centroids[,1]

dist_matrix <- dist(tsne_prompt_mean_centroids_mat)
dist_matrix_kable <- as.matrix(dist_matrix)
dist_matrix_kable[upper.tri(dist_matrix_kable)] <- NA
kable(as.data.frame(dist_matrix_kable), digits = 2)
ARA CHI FRE GER HIN ITA JPN KOR SPA TEL TUR
ARA 0.00 NA NA NA NA NA NA NA NA NA NA
CHI 0.12 0.00 NA NA NA NA NA NA NA NA NA
FRE 0.47 0.58 0.00 NA NA NA NA NA NA NA NA
GER 0.56 0.66 0.10 0.00 NA NA NA NA NA NA NA
HIN 0.52 0.62 0.48 0.56 0.00 NA NA NA NA NA NA
ITA 0.66 0.74 0.29 0.21 0.76 0.00 NA NA NA NA NA
JPN 0.41 0.52 0.38 0.47 0.12 0.67 0.00 NA NA NA NA
KOR 0.27 0.34 0.54 0.64 0.32 0.80 0.26 0.00 NA NA NA
SPA 0.31 0.32 0.47 0.51 0.75 0.51 0.63 0.57 0.00 NA NA
TEL 0.27 0.39 0.25 0.35 0.32 0.52 0.20 0.29 0.43 0.00 NA
TUR 0.11 0.14 0.46 0.53 0.60 0.60 0.48 0.38 0.20 0.31 0
ggdendro::ggdendrogram(hclust(dist_matrix)) +
  ggtitle("2D Pairwise Centroid Distance, averaging across prompts")

1.2.2 Distances in High-D

Pairwise distances between languages

centroids_prompt <- d %>%
  mutate(L1_prompt = paste0(L1_code, "_", prompt_id )) %>%
  split(.$L1_prompt) %>% 
  map_df(get_group_centroid, "L1_prompt") %>%
  separate(L1_prompt, c("L1_code", "prompt_id"), sep = "_")

prompt_mean_centroids <- centroids_prompt %>%
  split(.$L1_code) %>% 
  map_df(~colMeans(.[,-1:-2])) %>%
  t() %>% 
  as.data.frame() %>%
  rownames_to_column("L1_code")

prompt_mean_centroids_mat = as.matrix(prompt_mean_centroids[,-1])
rownames(prompt_mean_centroids_mat) = prompt_mean_centroids[,1]

dist_matrix <- dist(prompt_mean_centroids_mat)
dist_matrix_kable <- as.matrix(dist_matrix)
dist_matrix_kable[upper.tri(dist_matrix_kable)] <- NA
kable(as.data.frame(dist_matrix_kable), digits = 2)
ARA CHI FRE GER HIN ITA JPN KOR SPA TEL TUR
ARA 0.00 NA NA NA NA NA NA NA NA NA NA
CHI 1.44 0.00 NA NA NA NA NA NA NA NA NA
FRE 1.61 1.45 0.00 NA NA NA NA NA NA NA NA
GER 1.87 1.76 1.17 0.00 NA NA NA NA NA NA NA
HIN 2.02 2.20 2.14 2.03 0.00 NA NA NA NA NA NA
ITA 1.71 1.70 1.08 1.44 2.44 0.00 NA NA NA NA NA
JPN 1.62 1.10 1.65 1.99 2.59 1.70 0.00 NA NA NA NA
KOR 1.65 1.04 1.68 2.01 2.44 1.80 0.73 0.00 NA NA NA
SPA 1.27 1.44 0.96 1.31 2.08 1.08 1.61 1.66 0.00 NA NA
TEL 1.84 2.07 2.32 2.30 0.94 2.53 2.43 2.29 2.18 0.00 NA
TUR 1.19 1.19 1.22 1.50 1.86 1.47 1.40 1.36 1.12 1.84 0
ggdendro::ggdendrogram(hclust(dist_matrix)) +
  ggtitle("High-Dimensional Pairwise Centroid Distance, averaging across prompts")

2 Variance

Diameter is average pairwise distance between within-language essays in 2D-space.

tsne_diameters_prompt <- tsne_dims %>%
  nest(1:2, .key = "doc_vector") %>%
  mutate(L1_prompt = paste0(L1_code, "_", prompt_id )) %>%
  split(.$L1_prompt) %>% 
  map_df(get_group_diameter, "L1_prompt", "mean_dist")  %>%
  separate(L1_prompt, c("L1_code", "prompt_id"), sep = "_")

tsne_prompt_mean_diameter <- tsne_diameters_prompt %>%
  group_by(L1_code) %>%
  multi_boot_standard(col = "diameter")

ggplot(tsne_prompt_mean_diameter, aes(x = reorder(L1_code, mean), 
                      y = mean, 
                      fill = L1_code)) +
  geom_bar(stat = "identity") +
  geom_linerange(aes(ymin = ci_lower, ymax = ci_upper)) +
  xlab("L1") +
  ggtitle("Diameter, averaging across prompts") +
  theme_minimal() +
  theme(legend.position = "none") 

3 Coherence

3.1 Area overlap

all_lang_combos <- expand.grid(group_1 = as.character(unique(tsne_dims$L1_code)), 
                               group_2 = as.character(unique(tsne_dims$L1_code)))

all_prompts <- rep(unique(tsne_dims$prompt_id), each = nrow(all_lang_combos))

all_lang_combos2 <- do.call("rbind", replicate(length(unique(tsne_dims$prompt_id)), 
                                               all_lang_combos, simplify = FALSE)) %>%
                    bind_cols(prompt = all_prompts) %>%
                    mutate_all(funs(as.character)) %>%
                    filter(group_1 != group_2)


pairwise_tsne_overlap_prompt <- pmap_df(all_lang_combos2,
                                 get_2d_overlap_prompt, 
                                 tsne_dims)

counrtywise_overlap <- pairwise_tsne_overlap_prompt %>%
  group_by(group_1, prompt) %>%
  summarize(p_overlap = mean(p_overlap)) %>%
  group_by(group_1) %>%
  multi_boot_standard(col = "p_overlap") %>%
  rename(L1_code = group_1) # symmetrical for 1 and 2

ggplot(counrtywise_overlap, aes(x = reorder(L1_code, mean), 
                      y = mean, 
                      fill = L1_code)) +
  geom_bar(stat = "identity") +
  geom_linerange(aes(ymin = ci_lower, ymax = ci_upper)) +
  xlab("L1") +
  ggtitle("Proportion Overlap, averaging across prompts and pairs") +
  theme_minimal() +
  theme(legend.position = "none") 

Pairwise country dendogram of proportion overlap

pairwise_tsne_overlap_prompt_mat <- pairwise_tsne_overlap_prompt %>%
  group_by(group_1, group_2) %>%
  summarize(p_overlap = mean(p_overlap)) %>%
  mutate(dist_p_overlap = 1 - p_overlap) %>%
  select(group_1, group_2, dist_p_overlap) %>%
  spread(group_2, dist_p_overlap) %>%
  remove_rownames() %>%
  column_to_rownames("group_1") %>%
  as.matrix()

pairwise_tsne_overlap_prompt_mat <- as.matrix(pairwise_tsne_overlap_prompt_mat)

dist_matrix <- dist(pairwise_tsne_overlap_prompt_mat)
dist_matrix_kable <- as.matrix(dist_matrix)
dist_matrix_kable[upper.tri(dist_matrix_kable)] <- NA
kable(as.data.frame(dist_matrix_kable), digits = 2)
ARA CHI FRE GER HIN ITA JPN KOR SPA TEL TUR
ARA 0.00 NA NA NA NA NA NA NA NA NA NA
CHI 0.31 0.00 NA NA NA NA NA NA NA NA NA
FRE 0.34 0.47 0.00 NA NA NA NA NA NA NA NA
GER 0.35 0.52 0.18 0.00 NA NA NA NA NA NA NA
HIN 1.16 1.32 1.43 1.38 0.00 NA NA NA NA NA NA
ITA 0.37 0.44 0.21 0.33 1.52 0.00 NA NA NA NA NA
JPN 0.37 0.22 0.59 0.62 1.21 0.61 0.00 NA NA NA NA
KOR 0.36 0.24 0.60 0.63 1.18 0.59 0.09 0.00 NA NA NA
SPA 0.35 0.46 0.09 0.25 1.47 0.14 0.61 0.61 0.00 NA NA
TEL 1.10 1.24 1.34 1.31 0.08 1.44 1.13 1.11 1.37 0.00 NA
TUR 0.27 0.34 0.24 0.26 1.31 0.29 0.46 0.48 0.25 1.25 0
ggdendro::ggdendrogram(hclust(dist_matrix)) +
  ggtitle("Proportion overlap of ellipses, averaging across prompts")

3.2 Between/Within

For each prompt, for each pair of languages, I take ratio of within-group to between group distances. So, take the mean of the pairwise distances amongst all the essays in L1 and amongst all the essays in L2, and the mean pairwise distances between each essay in L1 and in L2 (Between), then compute Between / ((L1 + L2)/2). This number is 1 if there is no clustering, but gets larger with better clustering.

pairwise_tsne_cohesion <- pmap_df(all_lang_combos2,
                                 get_2d_coherence_prompt, 
                                 tsne_dims) 

countrywise_cohesions <- pairwise_tsne_cohesion %>%
  group_by(L1_code_1, prompt_id) %>%
  summarize(cohesion = mean(cohesion)) %>%
  group_by(L1_code_1) %>%
  multi_boot_standard(col = "cohesion") %>%
  rename(L1_code = L1_code_1) # symmetrical for 1 and 2

ggplot(countrywise_cohesions, aes(x = reorder(L1_code, mean), 
                      y = mean, 
                      fill = L1_code)) +
  geom_bar(stat = "identity") +
  geom_linerange(aes(ymin = ci_lower, ymax = ci_upper)) +
  xlab("L1") +
  ggtitle("Cohesion, averaging across prompts and pairs") +
  theme_minimal() +
  theme(legend.position = "none") 

Pairwise country dendogram of cohesion

pairwise_tsne_cohesion_mat <- pairwise_tsne_cohesion %>%
  group_by(L1_code_1, L1_code_2) %>%
  summarize(cohesion = mean(cohesion)) %>%
  select(L1_code_1, L1_code_2, cohesion) %>%
  spread(L1_code_2, cohesion) %>%
  remove_rownames() %>%
  column_to_rownames("L1_code_1") %>%
  as.matrix()

dist_matrix <- dist(pairwise_tsne_cohesion_mat)
dist_matrix_kable <- as.matrix(dist_matrix)
dist_matrix_kable[upper.tri(dist_matrix_kable)] <- NA
kable(as.data.frame(dist_matrix_kable), digits = 2)
ARA CHI FRE GER HIN ITA JPN KOR SPA TEL TUR
ARA 0.00 NA NA NA NA NA NA NA NA NA NA
CHI 0.08 0.00 NA NA NA NA NA NA NA NA NA
FRE 0.12 0.15 0.00 NA NA NA NA NA NA NA NA
GER 0.24 0.28 0.21 0.00 NA NA NA NA NA NA NA
HIN 1.06 1.07 1.07 0.92 0.00 NA NA NA NA NA NA
ITA 0.33 0.30 0.31 0.38 1.00 0.00 NA NA NA NA NA
JPN 0.33 0.28 0.37 0.40 0.99 0.19 0.00 NA NA NA NA
KOR 0.17 0.11 0.23 0.31 1.04 0.24 0.18 0.00 NA NA NA
SPA 0.15 0.15 0.07 0.21 1.05 0.25 0.31 0.20 0.00 NA NA
TEL 0.97 0.97 0.97 0.79 0.14 0.93 0.93 0.94 0.94 0.00 NA
TUR 0.18 0.22 0.18 0.23 1.05 0.47 0.49 0.32 0.24 0.94 0
ggdendro::ggdendrogram(hclust(dist_matrix)) +
  ggtitle("Proportion overlap of ellipses, averaging across prompts")

Overall, the overlap and cohesion measures are very similiar.