Read in doctag indices, docvecs, and metadata

doctag_indices <- read_feather("../all_data/doctag_indices.feather") 
docvecs <- read_feather("../all_data/docvecs.feather") %>%
  as.data.frame() %>%
  mutate(offset = 0:(n()-1)) %>%
  select(offset, everything())

#write_tsv(docvecs, "../all_data/docvecs.txt")
metadata <- read_csv("../all_data/merged_metadata.csv") 

Metadata

metadata_clean <- metadata %>%
  mutate_if(is.character, as.factor)  %>%
  mutate(essay_id = as.character(essay_id)) 

ggplot(metadata_clean, aes(x = age)) +
  geom_histogram() +
  ggtitle("Age distribution") +
  theme_minimal()

ggplot(metadata_clean, aes(x = score)) +
  geom_histogram(bins = 10) +
  ggtitle("Score distribution") +
  theme_minimal()

kable(count(metadata_clean, gender), caption = "gender distribution")
gender distribution
gender n
F 4573
M 6015
NA 1512
kable(count(metadata_clean, L1_code), caption = "L1 distribution")
L1 distribution
L1_code n
ARA 1100
CHI 1100
FRE 1100
GER 1100
HIN 1100
ITA 1100
JPN 1100
KOR 1100
SPA 1100
TEL 1100
TUR 1100
kable(count(metadata_clean, prompt_id), caption = "Prompt distribution")
Prompt distribution
prompt_id n
VC079857 1656
VC139555 1562
VC182427 1396
VC199494 1509
VC199744 1648
VC212639 960
VC247638 1686
VC251653 1683

Prompts

prompts = data.frame()
prompt_files <- list.files("../all_data/prompts/")
 for (i in 1:length(prompt_files)){
   fileName <- paste0("../all_data/prompts/", prompt_files[i])
   prompt <- readChar(fileName, file.info(fileName)$size)
   prompts <- bind_rows(prompts, 
                        data.frame(prompt_id = prompt_files[i], 
                                   prompt = prompt))
 }
prompts_clean <- prompts %>%
  rowwise() %>%
  mutate(prompt_id = str_replace_all(prompt_id, ".txt", ""),
         prompt = str_trim(str_replace_all(prompt, "\n", " ")),
         prompt = str_replace_all(prompt, "Use reasons and examples to support your answer.", ""),
         prompt = str_replace_all(prompt, "Use specific reasons and examples to support your answer.", ""),
         prompt = str_replace_all(prompt, "Do you agree or disagree with the following statement\\?", ""))
kable(prompts_clean)
prompt_id prompt
VC079857 It is better to have broad knowledge of many academic subjects than to specialize in one specific subject.
VC139555 Young people enjoy life more than older people do.
VC182427 Young people nowadays do not give enough time to helping their communities.
VC199494 Most advertisements make products seem much better than they really are.
VC199744 In twenty years, there will be fewer cars in use than there are today.
VC212639 The best way to travel is in a group led by a tour guide.
VC247638 It is more important for students to understand ideas and concepts than it is for them to learn facts.
VC251653 Successful people try new things and take risks rather than only doing what they already know how to do well.

Merge all data sources together

d <- doctag_indices %>%
  left_join(metadata_clean) %>%
  left_join(docvecs) %>%
  nest(-1:-10, .key = "doc_vector") %>%
  select(-doc_count, -offset, -test_center_country_code) %>%
  mutate(prompt_id = fct_recode(prompt_id, 
                                broad_academics = "VC079857", 
                                young_enjoy = "VC139555", 
                                young_help = "VC182427",
                                advertisements = "VC199494",
                                fewer_cars = "VC199744",
                                tour_travel = "VC212639",
                                students_facts = "VC247638",
                                success_risks = "VC251653"))

High-d space

get_group_centroid <- function(doc_vecs_df, group_name){
  centroid <- doc_vecs_df %>%
    select(doc_vector) %>%
    unnest(doc_vector) %>%
    colMeans() 

  this_group_name  <- doc_vecs_df %>%
    select_(group_name) %>%
    slice(1) %>%
    unlist()

  df <- data.frame(t(centroid)) %>%
    mutate(x = this_group_name) %>%
    select(x, everything())
  
   names(df)[1] = c(as.name(group_name))
   df
}
get_group_diameter <- function(doc_vecs_df, group_name){
  
  these_vecs <- doc_vecs_df %>%
    select(doc_vector) %>%
    unnest(doc_vector) %>%
    as.matrix()
  
  dist_sims <- dist2(these_vecs, method = "euclidean", norm = "none")  # get pairwise distances
  diag(dist_sims) <- NA # get pairwise distances 
  max_dist = max(as.data.frame(dist_sims), na.rm = T) # get max distances between points
  
  this_group_name <- doc_vecs_df %>%
    select_(group_name) %>%
    slice(1) %>%
    unlist()

  df <- data.frame(x = this_group_name,
             max_dist = max_dist)
  names(df)[1] = c(as.name(group_name))
  df
}

By language (pooling prompts)

Centroid distance

centroids <- d %>%
  split(.$L1_code) %>% 
  map_df(get_group_centroid, "L1_code")

centroid_mat = as.matrix(centroids[,-1])
rownames(centroid_mat) = centroids[,1]

ggdendro::ggdendrogram(hclust(dist(centroid_mat)), size = 2) +
  ggtitle("High-Dimensional Pairwise Centroid Distance, pooling across prompts")

#plot(hclust(dist(centroid_mat)))

Diameters

diameters <- d %>%
  split(.$L1_code) %>% 
  map_df(get_group_diameter, "L1_code") 

ggplot(diameters, aes(x = reorder(L1_code, max_dist), 
                      y = max_dist, 
                      fill = L1_code)) +
  geom_bar(stat = "identity") +
  xlab("L1") +
  ggtitle("Diameter") +
  theme_minimal() +
  theme(legend.position = "none") 

#plot(ape::as.phylo(hclust(dist(centroid_mat))), type = "unrooted")

By language and prompt (averaging across prompts)

Centroid distance

centroids_prompt <- d %>%
  mutate(L1_prompt = paste0(L1_code, "_", prompt_id )) %>%
  split(.$L1_prompt) %>% 
  map_df(get_group_centroid, "L1_prompt") %>%
  separate(L1_prompt, c("L1_code", "prompt_id"), sep = "_")

prompt_mean_centroids <- centroids_prompt %>%
  split(.$L1_code) %>% 
  map_df(~colMeans(.[,-1:-2])) %>%
  t() %>% 
  as.data.frame() %>%
  rownames_to_column("L1_code")

prompt_mean_centroids_mat = as.matrix(prompt_mean_centroids[,-1])
rownames(prompt_mean_centroids_mat) = prompt_mean_centroids[,1]

ggdendro::ggdendrogram(hclust(dist(prompt_mean_centroids_mat))) +
  ggtitle("High-Dimensional Pairwise Centroid Distance, averaging across prompts")

Diameters

diameters_prompt <- d %>%
    mutate(L1_prompt = paste0(L1_code, "_", prompt_id )) %>%
  split(.$L1_prompt) %>% 
  map_df(get_group_diameter, "L1_prompt")  %>%
  separate(L1_prompt, c("L1_code", "prompt_id"), sep = "_")

prompt_mean_diameter<- diameters_prompt %>%
  group_by(L1_code)  %>%
  summarize(mean = mean(max_dist))
  #multi_boot_standard(column = "max_dist")

ggplot(prompt_mean_diameter, aes(x = reorder(L1_code, mean), 
                      y = mean, 
                      fill = L1_code)) +
  geom_bar(stat = "identity") +
  #geom_linerange(aes(ymin = summary_ci_lower, ymax = summary_ci_upper)) +
  ylab("diameter") +
  xlab("L1") +
  ggtitle("Diameter, averaging across prompts") +
  theme_minimal() +
  theme(legend.position = "none") 

2D Space

Visualize Distributions

mats <- d %>%
  select(doc_vector) %>%
  unnest() %>%
  as.matrix()

tsne_out = Rtsne::Rtsne(mats)
tsne_dims <- tsne_out$Y %>%
  as.data.frame() %>%
  rename(tsne_X = V1,
         tsne_Y = V2) %>%
  bind_cols(d %>% select(essay_id, score, age, gender, L1_code, prompt_id)) %>%
  select(everything(), tsne_X, tsne_Y)

ggplot(tsne_dims, aes(x = tsne_X, y = tsne_Y, color = L1_code)) +
  geom_point(size = .2) +
  theme_minimal()

ggplot(tsne_dims, aes(x = tsne_X, y = tsne_Y, color = prompt_id, fill = prompt_id)) +
  #geom_encircle(alpha = .2) +
  #geom_point(size = .2) +
  geom_density2d(alpha=.5) + 
  theme_minimal()

ggplot(tsne_dims, aes(x = tsne_X, y = tsne_Y, color = prompt_id, fill = prompt_id)) +
  facet_grid(prompt_id ~L1_code ) +
  geom_encircle(alpha = .2) +
  theme_minimal()

ggplot(tsne_dims, aes(x = tsne_X, y = tsne_Y, color = L1_code, fill = L1_code)) +
  facet_wrap(~prompt_id) +
  geom_encircle(alpha = .2) +
  theme_minimal()

By language (pooling prompts)

Centroid Distances

tsne_centroids <- tsne_dims %>%
  nest(1:2, .key = "doc_vector") %>%
  split(.$L1_code) %>% 
  map_df(get_group_centroid, "L1_code")

#ggplot(tsne_centroids, aes(x = tsne_X, y = tsne_Y, fill = L1_code)) +
#  geom_label(aes(label = L1_code)) +
#  theme_minimal() +
#  theme(legend.position = "none") 

ggplot(tsne_centroids) +
geom_point(aes(x = tsne_X, y = tsne_Y), size = 4, color = 'black') +
  ggrepel::geom_label_repel(
    aes(tsne_X, y = tsne_Y, fill = L1_code, label = L1_code),
    fontface = 'bold', color = 'black',
    box.padding = unit(0.35, "lines"),
    point.padding = unit(0.2, "lines"),
    segment.color = 'black') +
  ggtitle("Centroids in 2D") +
  theme_minimal() +
  theme(legend.position = "none") 

Diameters

tsne_diameters <- tsne_dims %>%
  nest(1:2, .key = "doc_vector") %>%
  split(.$L1_code) %>% 
  map_df(get_group_diameter, "L1_code")  

ggplot(tsne_diameters, aes(x = reorder(L1_code, max_dist), 
                      y = max_dist, 
                      fill = L1_code)) +
  geom_bar(stat = "identity") +
  xlab("L1") +
  ggtitle("Diameter") +
  theme_minimal() +
  theme(legend.position = "none")

By language and prompt (averaging across prompts)

Centroid distances

tsne_centroids_by_prompt <- tsne_dims %>%
  nest(1:2, .key = "doc_vector") %>%
  mutate(L1_prompt = paste0(L1_code, "_", prompt_id )) %>%
  split(.$L1_prompt) %>% 
  map_df(get_group_centroid, "L1_prompt") %>%
  separate(L1_prompt, c("L1_code", "prompt_id"), sep = "_") 

tsne_prompt_mean_centroids <- tsne_centroids_by_prompt %>%
  split(.$L1_code) %>% 
  map_df(~colMeans(.[,-1:-2])) %>%
  t() %>% 
  as.data.frame() %>%
  rownames_to_column("L1_code") %>%
  rename(tsne_X = V1,
         tsne_Y = V2)

ggplot(tsne_centroids_by_prompt, aes(x = tsne_X, y = tsne_Y)) +
 geom_point(aes(color = L1_code)) +
  facet_wrap( ~prompt_id, scales = "free") +
  theme_minimal() 

ggplot(tsne_prompt_mean_centroids) +
geom_point(aes(x = tsne_X, y = tsne_Y), size = 4, color = 'black') +
  ggrepel::geom_label_repel(
    aes(tsne_X, y = tsne_Y, fill = L1_code, label = L1_code),
    fontface = 'bold', color = 'black',
    box.padding = unit(0.35, "lines"),
    point.padding = unit(0.2, "lines"),
    segment.color = 'black') +
  ggtitle("Centroids in 2D, averaging across prompts") +
  theme_minimal() +
  theme(legend.position = "none") 

Diameters

tsne_diameters_prompt <- tsne_dims %>%
  nest(1:2, .key = "doc_vector") %>%
  mutate(L1_prompt = paste0(L1_code, "_", prompt_id )) %>%
  split(.$L1_prompt) %>% 
  map_df(get_group_diameter, "L1_prompt")  %>%
  separate(L1_prompt, c("L1_code", "prompt_id"), sep = "_")

tsne_prompt_mean_diameter<- tsne_diameters_prompt %>%
  group_by(L1_code) %>%
  summarize(mean = mean(max_dist))
  #multi_boot_standard(column = "max_dist")

ggplot(tsne_prompt_mean_diameter, aes(x = reorder(L1_code, mean), 
                      y = mean, 
                      fill = L1_code)) +
  geom_bar(stat = "identity") +
  #geom_linerange(aes(ymin = summary_ci_lower, ymax = summary_ci_upper)) +
  
  xlab("L1") +
  ggtitle("Diameter, averaging across prompts") +
  theme_minimal() +
  theme(legend.position = "none") 

Area overlap in 2D

pooling across prompts

get_2d_overlap <- function(group_1, group_2, df){
  df1 <- filter(df, L1_code == group_1)
  df2 <- filter(df, L1_code == group_2)

  siar_obj = siar::overlap(df1$tsne_X,df1$tsne_Y, df2$tsne_X, df2$tsne_Y)
  overlap = siar_obj$overlap
  p_overlap = overlap/mean(siar_obj$area1, siar_obj$area2) # overlaps normalized by mean area
  
  data.frame(group_1  = group_1,
             group_2 = group_2,
             overlap = overlap, 
             p_overlap = p_overlap)
}

all_lang_combos <- expand.grid(group_1 = as.character(unique(tsne_dims$L1_code)), 
                               group_2 = as.character(unique(tsne_dims$L1_code)))

pairwise_tsne_overlap <- map2_df(all_lang_combos$group_1, 
                                 all_lang_combos$group_2,
                                 get_2d_overlap, 
                                 tsne_dims)

pairwise_tsne_overlap_dist <- pairwise_tsne_overlap %>%
  mutate(dist_p_overlap = 1 - p_overlap) %>%
  select(group_1, group_2, dist_p_overlap) %>%
  spread(group_2,dist_p_overlap) %>%
  remove_rownames() %>%
  column_to_rownames("group_1") %>%
  as.matrix()

prompt_mean_centroids_mat <- as.matrix(prompt_mean_centroids[,-1])
rownames(prompt_mean_centroids_mat) = prompt_mean_centroids[,1]

ggdendro::ggdendrogram(hclust(dist(pairwise_tsne_overlap_dist))) +
  ggtitle("Proportion overlap of ellipses, pooling across prompts")

averaging across prompts

get_2d_overlap_prompt <- function(group_1, group_2, prompt, df){
   df1 <- filter(df, L1_code == group_1, prompt_id == prompt)
  df2 <- filter(df, L1_code == group_2, prompt_id == prompt)

  siar_obj = siar::overlap(df1$tsne_X,df1$tsne_Y, df2$tsne_X, df2$tsne_Y)
  overlap = siar_obj$overlap
  p_overlap = overlap/mean(siar_obj$area1, siar_obj$area2) # overlaps normalized by mean area
  
  data.frame(group_1  = group_1,
             group_2 = group_2,
             prompt = prompt,
             overlap = overlap, 
             p_overlap = p_overlap)
}

all_prompts <- rep(unique(tsne_dims$prompt_id), each = nrow(all_lang_combos))

all_lang_combos2 <- do.call("rbind", replicate(length(unique(tsne_dims$prompt_id)), 
                                               all_lang_combos, simplify = FALSE)) %>%
                    bind_cols(prompt = all_prompts) %>%
  mutate_all(funs(as.character))


pairwise_tsne_overlap_prompt <- pmap_df(all_lang_combos2,
                                 get_2d_overlap_prompt, 
                                 tsne_dims)

pairwise_tsne_overlap_prompt_mat <- pairwise_tsne_overlap_prompt %>%
  group_by(group_1, group_2) %>%
  summarize(p_overlap = mean(p_overlap)) %>%
  mutate(dist_p_overlap = 1 - p_overlap) %>%
  select(group_1, group_2, dist_p_overlap) %>%
  spread(group_2,dist_p_overlap) %>%
  remove_rownames() %>%
  column_to_rownames("group_1") %>%
  as.matrix()

pairwise_tsne_overlap_prompt_mat <- as.matrix(pairwise_tsne_overlap_prompt_mat)

ggdendro::ggdendrogram(hclust(dist(pairwise_tsne_overlap_prompt_mat))) +
  ggtitle("Proportion overlap of ellipses, averaging across prompts")

Prop. overlap language matrix

diag(pairwise_tsne_overlap_prompt_mat) <- ""
pairwise_tsne_overlap_prompt_mat[upper.tri(pairwise_tsne_overlap_prompt_mat)] <- ""
kable(pairwise_tsne_overlap_prompt_mat)
ARA CHI FRE GER HIN ITA JPN KOR SPA TEL TUR
ARA
CHI 0.247233837870026
FRE 0.17785820751051 0.27269483865517
GER 0.19261301405023 0.305131384258742 0.184189439645891
HIN 0.678398161107181 0.688971399134594 0.654901764079375 0.65904866225239
ITA 0.0984127614333606 0.245789748926822 0.224353716968205 0.416436601355935 0.749772502363741
JPN 0.324830619217136 0.293185999947818 0.495341862084987 0.623535937718317 0.771661993550938 0.482842510859474
KOR 0.331923593561158 0.262751623404783 0.476213924165527 0.606124049174115 0.788831165987201 0.511814612841325 0.238487156149559
SPA 0.100746291739933 0.267401745495666 0.24146112922371 0.369292297402464 0.729251179619222 0.327300714079131 0.395852142654985 0.354162721651465
TEL 0.641150469122112 0.660124196884375 0.642619787547842 0.62856410984248 0.282475673128749 0.803794470438355 0.74160105479356 0.67200881345187 0.699333507548088
TUR 0.241883027472553 0.324921790304212 0.315397794477124 0.385279892055217 0.659447705171416 0.478343128044118 0.405953568396441 0.370423528949248 0.33022417608558 0.671758335611655

Prompts sorted by overlap

pairwise_tsne_overlap_prompt %>%
  rename(prompt_id = prompt) %>%
  group_by(prompt_id) %>%
  summarize(mean = mean(p_overlap)) %>%
  arrange(mean) %>%
  left_join(prompts_clean) %>%
  kable()
prompt_id mean prompt
VC079857 0.5167640 It is better to have broad knowledge of many academic subjects than to specialize in one specific subject.
VC139555 0.5376194 Young people enjoy life more than older people do.
VC182427 0.5438269 Young people nowadays do not give enough time to helping their communities.
VC199744 0.5493628 In twenty years, there will be fewer cars in use than there are today.
VC212639 0.5780431 The best way to travel is in a group led by a tour guide.
VC247638 0.6274587 It is more important for students to understand ideas and concepts than it is for them to learn facts.
VC251653 0.6696384 Successful people try new things and take risks rather than only doing what they already know how to do well.
VC199494 0.6931459 Most advertisements make products seem much better than they really are.