Here, we look at the difference in the distributions across languages with mean and variance in original (200) dimensionality.

0.1 Centroids

0.1.1 Distances

Pairwise distances between languages

centroids_prompt <- d %>%
  mutate(L1_prompt = paste0(L1_code, "_", prompt_id )) %>%
  split(.$L1_prompt) %>% 
  map_df(get_group_centroid, "L1_prompt") %>%
  separate(L1_prompt, c("L1_code", "prompt_id"), sep = "_")

prompt_mean_centroids <- centroids_prompt %>%
  split(.$L1_code) %>% 
  map_df(~colMeans(.[,-1:-2])) %>%
  t() %>% 
  as.data.frame() %>%
  rownames_to_column("L1_code")

prompt_mean_centroids_mat = as.matrix(prompt_mean_centroids[,-1])
rownames(prompt_mean_centroids_mat) = prompt_mean_centroids[,1]

dist_matrix <- dist(prompt_mean_centroids_mat)
dist_matrix_kable <- as.matrix(dist_matrix)
dist_matrix_kable[upper.tri(dist_matrix_kable)] <- NA
kable(as.data.frame(dist_matrix_kable), digits = 2)

	ARA	CHI	FRE	GER	HIN	ITA	JPN	KOR	SPA	TEL	TUR
ARA	0.00	NA	NA	NA	NA	NA	NA	NA	NA	NA	NA
CHI	1.44	0.00	NA	NA	NA	NA	NA	NA	NA	NA	NA
FRE	1.61	1.45	0.00	NA	NA	NA	NA	NA	NA	NA	NA
GER	1.87	1.76	1.17	0.00	NA	NA	NA	NA	NA	NA	NA
HIN	2.02	2.20	2.14	2.03	0.00	NA	NA	NA	NA	NA	NA
ITA	1.71	1.70	1.08	1.44	2.44	0.00	NA	NA	NA	NA	NA
JPN	1.62	1.10	1.65	1.99	2.59	1.70	0.00	NA	NA	NA	NA
KOR	1.65	1.04	1.68	2.01	2.44	1.80	0.73	0.00	NA	NA	NA
SPA	1.27	1.44	0.96	1.31	2.08	1.08	1.61	1.66	0.00	NA	NA
TEL	1.84	2.07	2.32	2.30	0.94	2.53	2.43	2.29	2.18	0.00	NA
TUR	1.19	1.19	1.22	1.50	1.86	1.47	1.40	1.36	1.12	1.84	0

ggdendro::ggdendrogram(hclust(dist_matrix)) +
  ggtitle("High-Dimensional Pairwise Centroid Distance, averaging across prompts")

write_csv(as.data.frame(dist_matrix_kable), "../../all_data/pairwise_country_distances/HD_centroid_distances.csv")

1 Variance

Diameter is average pairwise distance between within-language essays in 2D-space.

d_diameters_prompt <- d %>%
  mutate(L1_prompt = paste0(L1_code, "_", prompt_id )) %>%
  split(.$L1_prompt) %>% 
  map_df(get_group_diameter, "L1_prompt", "mean_dist")  %>%
  separate(L1_prompt, c("L1_code", "prompt_id"), sep = "_")

d_prompt_mean_diameter <- d_diameters_prompt %>%
  group_by(L1_code) %>%
  multi_boot_standard(col = "diameter")

ggplot(d_prompt_mean_diameter, aes(x = reorder(L1_code, mean), 
                      y = mean, 
                      fill = L1_code)) +
  geom_bar(stat = "identity") +
  geom_linerange(aes(ymin = ci_lower, ymax = ci_upper)) +
  xlab("L1") +
  ggtitle("Diameter, averaging across prompts") +
  theme_minimal() +
  theme(legend.position = "none")

L2ETS Study 1 analyses (HD) - Semantics

Molly Lewis

2017-12-03

0.1 Centroids

0.1.1 Distances

1 Variance