1 Mean
- 1.1 Visualize distributions
  - 1.1.1 All
  - 1.1.2 By-language
- 1.2 Centroids
  - 1.2.1 Distances in 2-D
  - 1.2.2 Distances in High-D
2 Variance
3 Coherence
- 3.1 Area overlap
- 3.2 Between/Within

Here, we look at the difference in the distributions across languages, with three measures: mean, variance and coherence.

Note that the exact numbers are dependent on the tSNE dimensions (should probably average across models?).

Read in doctag indices, docvecs, and metadata

doctag_indices <- read_feather("../all_data/doctag_indices.feather") 
docvecs <- read_feather("../all_data/docvecs.feather") %>%
  as.data.frame() %>%
  mutate(offset = 0:(n()-1)) %>%
  select(offset, everything())

#write_tsv(docvecs, "../all_data/docvecs.txt")
metadata <- read_csv("../all_data/merged_metadata.csv") 
metadata_clean <- metadata %>%
  mutate_if(is.character, as.factor)  %>%
  mutate(essay_id = as.character(essay_id))

Language summaries

kable(count(metadata_clean, L1_code), caption = "L1 distribution")

L1 distribution
L1_code	n
ARA	1100
CHI	1100
FRE	1100
GER	1100
HIN	1100
ITA	1100
JPN	1100
KOR	1100
SPA	1100
TEL	1100
TUR	1100

Merge all data sources together

d <- doctag_indices %>%
  left_join(metadata_clean) %>%
  left_join(docvecs) %>%
  nest(-1:-10, .key = "doc_vector") %>%
  select(-doc_count, -offset, -test_center_country_code) %>%
  mutate(prompt_id = fct_recode(prompt_id, 
                                broad.academics = "VC079857", 
                                young.enjoy = "VC139555", 
                                young.help = "VC182427",
                                advertisements = "VC199494",
                                fewer.cars = "VC199744",
                                tour.travel = "VC212639",
                                students.facts = "VC247638",
                                success.risks = "VC251653"))

Get TSNE coordinates.

#mats <- d %>%
#  select(doc_vector) %>%
#  unnest() %>%
#  as.matrix()

#tsne_out = Rtsne::Rtsne(mats)
#tsne_dims <- tsne_out$Y %>%
#  as.data.frame() %>%
#  rename(tsne_X = V1,
#         tsne_Y = V2) %>%
#  bind_cols(d %>% select(essay_id, score, age, gender, L1_code, prompt_id)) %>%
#  select(everything(), tsne_X, tsne_Y)
# write_csv(tsne_dims, "../all_data/tsne_dims_cached.csv")
tsne_dims <- read_csv("../all_data/tsne_dims_cached.csv")

1 Mean

1.1 Visualize distributions

1.1.1 All

ggplot(tsne_dims, aes(x = tsne_X, y = tsne_Y, color = L1_code)) +
  geom_point(size = .2) +
  theme_minimal()

1.1.2 By-language

ggplot(tsne_dims, aes(x = tsne_X, y = tsne_Y, color = L1_code)) +
  #geom_density2d(alpha=.5, aes(color = prompt_id)) + 
  facet_wrap(~L1_code)+
  geom_point(size = .2) +
  #geom_hex(bins = 20) +
  theme_minimal()

tsne_dims %>%
  filter(L1_code %in% c("GER", "HIN")) %>%
  ggplot(aes(x = tsne_X, y = tsne_Y, color = L1_code)) +
  geom_density2d(alpha=.5, aes(color = L1_code)) + 
  facet_wrap(~prompt_id)+
  #geom_point(size = .2) +
  #geom_hex(bins = 20) +
  theme_minimal()

1.2 Centroids

1.2.1 Distances in 2-D

tsne_centroids_by_prompt <- tsne_dims %>%
  nest(1:2, .key = "doc_vector") %>%
  mutate(L1_prompt = paste0(L1_code, "_", prompt_id )) %>%
  split(.$L1_prompt) %>% 
  map_df(get_group_centroid, "L1_prompt") %>%
  separate(L1_prompt, c("L1_code", "prompt_id"), sep = "_") 

tsne_prompt_mean_centroids <- tsne_centroids_by_prompt %>%
  split(.$L1_code) %>% 
  map_df(~colMeans(.[,-1:-2])) %>%
  t() %>% 
  as.data.frame() %>%
  rownames_to_column("L1_code") %>%
  rename(tsne_X = V1,
         tsne_Y = V2)

ggplot(tsne_centroids_by_prompt, aes(x = tsne_X, y = tsne_Y)) +
 geom_point(aes(color = L1_code)) +
  facet_wrap( ~prompt_id, scales = "free") +
  theme_minimal()

ggplot(tsne_prompt_mean_centroids) +
geom_point(aes(x = tsne_X, y = tsne_Y), size = 2, color = 'black') +
  ggrepel::geom_label_repel(
    aes(tsne_X, y = tsne_Y, fill = L1_code, label = L1_code),
    fontface = 'bold', color = 'black',
    box.padding = unit(0.35, "lines"),
    point.padding = unit(0.2, "lines"),
    segment.color = 'black') +
  ggtitle("Centroids in 2D, averaging across prompts") +
  theme_minimal() +
  theme(legend.position = "none")

Pairwise distances between languages

tsne_prompt_mean_centroids_mat <- as.matrix(tsne_prompt_mean_centroids[,-1])
rownames(tsne_prompt_mean_centroids_mat) = tsne_prompt_mean_centroids[,1]

dist_matrix <- dist(tsne_prompt_mean_centroids_mat)
dist_matrix_kable <- as.matrix(dist_matrix)
dist_matrix_kable[upper.tri(dist_matrix_kable)] <- NA
kable(as.data.frame(dist_matrix_kable), digits = 2)

	ARA	CHI	FRE	GER	HIN	ITA	JPN	KOR	SPA	TEL	TUR
ARA	0.00	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA
CHI	0.12	0.00	NA	NA	NA	NA	NA	NA	NA	NA	NA
FRE	0.47	0.58	0.00	NA	NA	NA	NA	NA	NA	NA	NA
GER	0.56	0.66	0.10	0.00	NA	NA	NA	NA	NA	NA	NA
HIN	0.52	0.62	0.48	0.56	0.00	NA	NA	NA	NA	NA	NA
ITA	0.66	0.74	0.29	0.21	0.76	0.00	NA	NA	NA	NA	NA
JPN	0.41	0.52	0.38	0.47	0.12	0.67	0.00	NA	NA	NA	NA
KOR	0.27	0.34	0.54	0.64	0.32	0.80	0.26	0.00	NA	NA	NA
SPA	0.31	0.32	0.47	0.51	0.75	0.51	0.63	0.57	0.00	NA	NA
TEL	0.27	0.39	0.25	0.35	0.32	0.52	0.20	0.29	0.43	0.00	NA
TUR	0.11	0.14	0.46	0.53	0.60	0.60	0.48	0.38	0.20	0.31	0

ggdendro::ggdendrogram(hclust(dist_matrix)) +
  ggtitle("2D Pairwise Centroid Distance, averaging across prompts")

1.2.2 Distances in High-D

Pairwise distances between languages

centroids_prompt <- d %>%
  mutate(L1_prompt = paste0(L1_code, "_", prompt_id )) %>%
  split(.$L1_prompt) %>% 
  map_df(get_group_centroid, "L1_prompt") %>%
  separate(L1_prompt, c("L1_code", "prompt_id"), sep = "_")

prompt_mean_centroids <- centroids_prompt %>%
  split(.$L1_code) %>% 
  map_df(~colMeans(.[,-1:-2])) %>%
  t() %>% 
  as.data.frame() %>%
  rownames_to_column("L1_code")

prompt_mean_centroids_mat = as.matrix(prompt_mean_centroids[,-1])
rownames(prompt_mean_centroids_mat) = prompt_mean_centroids[,1]

dist_matrix <- dist(prompt_mean_centroids_mat)
dist_matrix_kable <- as.matrix(dist_matrix)
dist_matrix_kable[upper.tri(dist_matrix_kable)] <- NA
kable(as.data.frame(dist_matrix_kable), digits = 2)

	ARA	CHI	FRE	GER	HIN	ITA	JPN	KOR	SPA	TEL	TUR
ARA	0.00	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA
CHI	1.44	0.00	NA	NA	NA	NA	NA	NA	NA	NA	NA
FRE	1.61	1.45	0.00	NA	NA	NA	NA	NA	NA	NA	NA
GER	1.87	1.76	1.17	0.00	NA	NA	NA	NA	NA	NA	NA
HIN	2.02	2.20	2.14	2.03	0.00	NA	NA	NA	NA	NA	NA
ITA	1.71	1.70	1.08	1.44	2.44	0.00	NA	NA	NA	NA	NA
JPN	1.62	1.10	1.65	1.99	2.59	1.70	0.00	NA	NA	NA	NA
KOR	1.65	1.04	1.68	2.01	2.44	1.80	0.73	0.00	NA	NA	NA
SPA	1.27	1.44	0.96	1.31	2.08	1.08	1.61	1.66	0.00	NA	NA
TEL	1.84	2.07	2.32	2.30	0.94	2.53	2.43	2.29	2.18	0.00	NA
TUR	1.19	1.19	1.22	1.50	1.86	1.47	1.40	1.36	1.12	1.84	0

ggdendro::ggdendrogram(hclust(dist_matrix)) +
  ggtitle("High-Dimensional Pairwise Centroid Distance, averaging across prompts")

2 Variance

Diameter is average pairwise distance between within-language essays in 2D-space.

tsne_diameters_prompt <- tsne_dims %>%
  nest(1:2, .key = "doc_vector") %>%
  mutate(L1_prompt = paste0(L1_code, "_", prompt_id )) %>%
  split(.$L1_prompt) %>% 
  map_df(get_group_diameter, "L1_prompt", "mean_dist")  %>%
  separate(L1_prompt, c("L1_code", "prompt_id"), sep = "_")

tsne_prompt_mean_diameter <- tsne_diameters_prompt %>%
  group_by(L1_code) %>%
  multi_boot_standard(col = "diameter")

ggplot(tsne_prompt_mean_diameter, aes(x = reorder(L1_code, mean), 
                      y = mean, 
                      fill = L1_code)) +
  geom_bar(stat = "identity") +
  geom_linerange(aes(ymin = ci_lower, ymax = ci_upper)) +
  xlab("L1") +
  ggtitle("Diameter, averaging across prompts") +
  theme_minimal() +
  theme(legend.position = "none")

3 Coherence

3.1 Area overlap

all_lang_combos <- expand.grid(group_1 = as.character(unique(tsne_dims$L1_code)), 
                               group_2 = as.character(unique(tsne_dims$L1_code)))

all_prompts <- rep(unique(tsne_dims$prompt_id), each = nrow(all_lang_combos))

all_lang_combos2 <- do.call("rbind", replicate(length(unique(tsne_dims$prompt_id)), 
                                               all_lang_combos, simplify = FALSE)) %>%
                    bind_cols(prompt = all_prompts) %>%
                    mutate_all(funs(as.character)) %>%
                    filter(group_1 != group_2)


pairwise_tsne_overlap_prompt <- pmap_df(all_lang_combos2,
                                 get_2d_overlap_prompt, 
                                 tsne_dims)

counrtywise_overlap <- pairwise_tsne_overlap_prompt %>%
  group_by(group_1, prompt) %>%
  summarize(p_overlap = mean(p_overlap)) %>%
  group_by(group_1) %>%
  multi_boot_standard(col = "p_overlap") %>%
  rename(L1_code = group_1) # symmetrical for 1 and 2

ggplot(counrtywise_overlap, aes(x = reorder(L1_code, mean), 
                      y = mean, 
                      fill = L1_code)) +
  geom_bar(stat = "identity") +
  geom_linerange(aes(ymin = ci_lower, ymax = ci_upper)) +
  xlab("L1") +
  ggtitle("Proportion Overlap, averaging across prompts and pairs") +
  theme_minimal() +
  theme(legend.position = "none")

Pairwise country dendogram of proportion overlap

pairwise_tsne_overlap_prompt_mat <- pairwise_tsne_overlap_prompt %>%
  group_by(group_1, group_2) %>%
  summarize(p_overlap = mean(p_overlap)) %>%
  mutate(dist_p_overlap = 1 - p_overlap) %>%
  select(group_1, group_2, dist_p_overlap) %>%
  spread(group_2, dist_p_overlap) %>%
  remove_rownames() %>%
  column_to_rownames("group_1") %>%
  as.matrix()

pairwise_tsne_overlap_prompt_mat <- as.matrix(pairwise_tsne_overlap_prompt_mat)

dist_matrix <- dist(pairwise_tsne_overlap_prompt_mat)
dist_matrix_kable <- as.matrix(dist_matrix)
dist_matrix_kable[upper.tri(dist_matrix_kable)] <- NA
kable(as.data.frame(dist_matrix_kable), digits = 2)

	ARA	CHI	FRE	GER	HIN	ITA	JPN	KOR	SPA	TEL	TUR
ARA	0.00	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA
CHI	0.31	0.00	NA	NA	NA	NA	NA	NA	NA	NA	NA
FRE	0.34	0.47	0.00	NA	NA	NA	NA	NA	NA	NA	NA
GER	0.35	0.52	0.18	0.00	NA	NA	NA	NA	NA	NA	NA
HIN	1.16	1.32	1.43	1.38	0.00	NA	NA	NA	NA	NA	NA
ITA	0.37	0.44	0.21	0.33	1.52	0.00	NA	NA	NA	NA	NA
JPN	0.37	0.22	0.59	0.62	1.21	0.61	0.00	NA	NA	NA	NA
KOR	0.36	0.24	0.60	0.63	1.18	0.59	0.09	0.00	NA	NA	NA
SPA	0.35	0.46	0.09	0.25	1.47	0.14	0.61	0.61	0.00	NA	NA
TEL	1.10	1.24	1.34	1.31	0.08	1.44	1.13	1.11	1.37	0.00	NA
TUR	0.27	0.34	0.24	0.26	1.31	0.29	0.46	0.48	0.25	1.25	0

ggdendro::ggdendrogram(hclust(dist_matrix)) +
  ggtitle("Proportion overlap of ellipses, averaging across prompts")

3.2 Between/Within

For each prompt, for each pair of languages, I take ratio of within-group to between group distances. So, take the mean of the pairwise distances amongst all the essays in L1 and amongst all the essays in L2, and the mean pairwise distances between each essay in L1 and in L2 (Between), then compute Between / ((L1 + L2)/2). This number is 1 if there is no clustering, but gets larger with better clustering.

pairwise_tsne_cohesion <- pmap_df(all_lang_combos2,
                                 get_2d_coherence_prompt, 
                                 tsne_dims) 

countrywise_cohesions <- pairwise_tsne_cohesion %>%
  group_by(L1_code_1, prompt_id) %>%
  summarize(cohesion = mean(cohesion)) %>%
  group_by(L1_code_1) %>%
  multi_boot_standard(col = "cohesion") %>%
  rename(L1_code = L1_code_1) # symmetrical for 1 and 2

ggplot(countrywise_cohesions, aes(x = reorder(L1_code, mean), 
                      y = mean, 
                      fill = L1_code)) +
  geom_bar(stat = "identity") +
  geom_linerange(aes(ymin = ci_lower, ymax = ci_upper)) +
  xlab("L1") +
  ggtitle("Cohesion, averaging across prompts and pairs") +
  theme_minimal() +
  theme(legend.position = "none")

Pairwise country dendogram of cohesion

pairwise_tsne_cohesion_mat <- pairwise_tsne_cohesion %>%
  group_by(L1_code_1, L1_code_2) %>%
  summarize(cohesion = mean(cohesion)) %>%
  select(L1_code_1, L1_code_2, cohesion) %>%
  spread(L1_code_2, cohesion) %>%
  remove_rownames() %>%
  column_to_rownames("L1_code_1") %>%
  as.matrix()

dist_matrix <- dist(pairwise_tsne_cohesion_mat)
dist_matrix_kable <- as.matrix(dist_matrix)
dist_matrix_kable[upper.tri(dist_matrix_kable)] <- NA
kable(as.data.frame(dist_matrix_kable), digits = 2)

	ARA	CHI	FRE	GER	HIN	ITA	JPN	KOR	SPA	TEL	TUR
ARA	0.00	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA
CHI	0.08	0.00	NA	NA	NA	NA	NA	NA	NA	NA	NA
FRE	0.12	0.15	0.00	NA	NA	NA	NA	NA	NA	NA	NA
GER	0.24	0.28	0.21	0.00	NA	NA	NA	NA	NA	NA	NA
HIN	1.06	1.07	1.07	0.92	0.00	NA	NA	NA	NA	NA	NA
ITA	0.33	0.30	0.31	0.38	1.00	0.00	NA	NA	NA	NA	NA
JPN	0.33	0.28	0.37	0.40	0.99	0.19	0.00	NA	NA	NA	NA
KOR	0.17	0.11	0.23	0.31	1.04	0.24	0.18	0.00	NA	NA	NA
SPA	0.15	0.15	0.07	0.21	1.05	0.25	0.31	0.20	0.00	NA	NA
TEL	0.97	0.97	0.97	0.79	0.14	0.93	0.93	0.94	0.94	0.00	NA
TUR	0.18	0.22	0.18	0.23	1.05	0.47	0.49	0.32	0.24	0.94	0

ggdendro::ggdendrogram(hclust(dist_matrix)) +
  ggtitle("Proportion overlap of ellipses, averaging across prompts")

Overall, the overlap and cohesion measures are very similiar.

L2ETS Study 1 analyses - Semantics

Molly Lewis

2017-10-04