This analysis compares the location of an essay in its native (writer's L1) semantic space vs. a non-native semantic space. The prediction is that an essay should be closer to the prompt in its native space. We look at German, Hindi, Korean, and Turkish (along with English) for the “broad academics” prompt (“VC079857”).

Set params.

PROMPT <- "VC079857"
LANGS <- c("ENG", "GER", "HIN", "TUR", "KOR")

Read in model data

all_models <- read_feather("data/all_langs_word_vectors.feather")
  
all_models_clean <- all_models  %>%
                        select(-type, -translation) %>% 
                        data.table(key = "word")

unique_words <- unique(all_models$word)
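
A quick sanity check (not part of the original pipeline) can confirm the table looks as expected. This assumes the tidyverse and data.table are loaded in a setup chunk, and that the language of each word vector is stored in a column called `lang` (an assumption about the feather file).

# Sanity check (assumption: language column is called `lang`):
# vocabulary size per language model, and the shared vocabulary overall.
all_models_clean %>%
  count(lang)

length(unique_words)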

Read in essay data

essay_word_counts <- read_feather("data/essay_word_counts_cleaned.feather") %>%
  rename(word = word_cleaned) %>%
  arrange(essay_id) %>%
  filter(word %in% unique_words, # we only care about the top N words
         prompt_id == PROMPT) %>%
  data.table(key = "word")

unique_essays <- unique(essay_word_counts$essay_id)

Get essay coordinates in semantic space of each language

essay_lang_combos <- expand.grid(unique_essays, LANGS) %>%
                            rename(essay_id = Var1, 
                                   L1_code = Var2) %>%
                            mutate_all(as.character)

essays_in_all_langs <- map2_df(essay_lang_combos$essay_id,  
                                  essay_lang_combos$L1_code,
                                  get_essay_location, 
                                  all_models_clean, 
                                  essay_word_counts)
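
`get_essay_location` is defined elsewhere; below is a minimal sketch of its assumed behavior: subset the word vectors to one language model, join them to the essay's word counts, and take a count-weighted mean of each embedding dimension to get a single point for the essay in that space. The column names (`lang`, `count`, and `V1...Vn` dimension columns) are assumptions, not the definitive implementation.

# Minimal sketch of the assumed get_essay_location(): a count-weighted average
# of the essay's word vectors in one language's semantic space.
# Assumptions: the model table has a language column `lang` and dimension
# columns named V1...Vn; the word-count column is `count`.
get_essay_location_sketch <- function(current_essay_id, current_lang,
                                      models, word_counts) {
  lang_model <- models[lang == current_lang]                  # vectors for this language
  essay_words <- word_counts[essay_id == current_essay_id]    # this essay's words + counts

  merged <- merge(essay_words, lang_model, by = "word")
  dims <- grep("^V", names(merged), value = TRUE)

  merged %>%
    summarise(across(all_of(dims), ~ weighted.mean(.x, w = count))) %>%
    mutate(essay_id = current_essay_id,
           L1_code = current_lang)
}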

Get prompt coordinates in semantic space of each language

prompt_path <- paste0("data/", PROMPT, "_prompt_word_counts.feather")

prompt_counts <- read_feather(prompt_path) %>%
                        rename(word = word_cleaned,
                               count = n) %>%
                        mutate(essay_id = 'prompt') %>%
                        data.table(key = "word")

prompts_in_all_langs <- map2_df(rep("prompt", length(LANGS)),
                            LANGS, 
                            get_essay_location, 
                            all_models_clean, 
                            prompt_counts)

Get TSNE coordinates (separately for each language)

# merge together prompts and essays
prompts_and_essays <- bind_rows(essays_in_all_langs, prompts_in_all_langs)

# Get TSNE coordinates for all essays in each language
essays_in_all_langs_2D <- map_df(LANGS, get_tsne_by_lang, prompts_and_essays)

#write_csv(essays_in_all_langs_2D, "../../all_data/tsne_coords_all_essays_cached.csv")
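
`get_tsne_by_lang` is also defined elsewhere; a sketch of its assumed behavior: subset the high-dimensional essay and prompt locations to one language model, run t-SNE on the embedding columns, and return the 2D coordinates with identifiers. The use of Rtsne, the coordinate names (`tsne_X`, `tsne_Y`), and the `V1...Vn` dimension names are assumptions.

# Sketch of the assumed get_tsne_by_lang(): t-SNE over the high-dimensional
# essay + prompt locations for a single language model.
# Assumptions: Rtsne is the t-SNE implementation, dimension columns are named
# V1...Vn, and the output coordinates are called tsne_X / tsne_Y.
get_tsne_by_lang_sketch <- function(current_lang, locations) {
  lang_locations <- locations %>%
    filter(L1_code == current_lang)

  dims <- grep("^V", names(lang_locations), value = TRUE)

  tsne_out <- Rtsne::Rtsne(as.matrix(select(lang_locations, all_of(dims))),
                           dims = 2,
                           check_duplicates = FALSE)  # perplexity left at default

  tibble(tsne_X = tsne_out$Y[, 1],
         tsne_Y = tsne_out$Y[, 2],
         essay_id = lang_locations$essay_id,
         L1_code = current_lang)
}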

Read in cached data

essays_in_all_langs_2D <- read_csv("../../all_data/tsne_coords_all_essays_cached.csv",
                                   col_types = cols(col_double(), 
                                                    col_double(), 
                                                    col_character(),
                                                    col_character()))

Merge back in metadata to essay and prompt coordinates

metadata_path <- "../../all_data/raw/merged_metadata.csv"
metadata_clean <- read_csv(metadata_path) %>%
  mutate_if(is.character, as.factor)  %>%
  mutate(essay_id = as.character(essay_id))

essay_data <- essays_in_all_langs_2D %>%
                  rename(model_lang_code = L1_code) %>%
                  left_join(metadata_clean %>% select(L1_code,
                                                      score, 
                                                      essay_id), by = "essay_id") %>%
                  mutate(score_bin = ifelse(score > 3, "high", "low"), 
                         score_bin = fct_rev(as.factor(score_bin))) %>%  # add score factor
                  filter(essay_id != "prompt",
                         L1_code %in% LANGS)
  
prompt_data <- essays_in_all_langs_2D %>%
  filter(essay_id == "prompt") %>%
  rename(model_lang_code = L1_code)
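
A quick sanity check (not part of the original pipeline): how many essays end up in each writer-L1 x score-bin cell, per semantic space.

# Sanity check: essay counts by semantic space, writer L1, and score bin.
essay_data %>%
  count(model_lang_code, L1_code, score_bin)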

Results

In the plots below, each small point is an essay, the larger colored points are language (L1) centroids, and the cross is the prompt centroid.

Plots by language

lang_plot(essay_data, prompt_data, "ENG",  essays = T, labels = F)

lang_plot(essay_data, prompt_data, "HIN",  essays = T, labels = F)

lang_plot(essay_data, prompt_data, "KOR",  essays = T, labels = F)

lang_plot(essay_data, prompt_data, "TUR",  essays = T, labels = F)

lang_plot(essay_data, prompt_data, "GER",  essays = T, labels = F)

Plots by scores

score_plot(essay_data, prompt_data, "ENG")

score_plot(essay_data, prompt_data, "HIN")

score_plot(essay_data, prompt_data, "KOR")

score_plot(essay_data, prompt_data, "TUR")

score_plot(essay_data, prompt_data, "GER")

Plots by language and scores

score_lang_plot(essay_data, prompt_data, "ENG", essays = T, labels = F)

score_lang_plot(essay_data, prompt_data, "HIN",  essays = T, labels = F)

score_lang_plot(essay_data, prompt_data, "KOR",  essays = T, labels = F)

score_lang_plot(essay_data, prompt_data, "TUR",  essays = T, labels = F)

score_lang_plot(essay_data, prompt_data, "GER",  essays = T, labels = F)

Prompt-essay centroid distances

Column facets correspond to different language embedding models. Row facets correspond to high- vs. low-scoring essays. Prediction: the distance should be smaller when the writer's L1 matches the language of the embedding model (i.e., L1_code == model_L1_code). This holds, except for Turkish.

prompt_dists <- map_df(LANGS, get_prompt_lang_centroid_score_centroid_dist, 
                       essay_data, prompt_data)
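
`get_prompt_lang_centroid_score_centroid_dist` is defined elsewhere; a sketch of its assumed behavior: within one semantic space, take the centroid of essays grouped by writer L1 and score bin, and compute its Euclidean distance to the prompt's location in that space. Coordinate names are again assumptions.

# Sketch of the assumed get_prompt_lang_centroid_score_centroid_dist():
# Euclidean distance from each (writer L1 x score bin) essay centroid to the
# prompt location, within one semantic space. tsne_X / tsne_Y are assumed.
get_prompt_dist_sketch <- function(model_lang, essay_data, prompt_data) {
  prompt_loc <- prompt_data %>%
    filter(model_lang_code == model_lang)

  essay_data %>%
    filter(model_lang_code == model_lang) %>%
    group_by(L1_code, score_bin) %>%
    summarise(centroid_X = mean(tsne_X),
              centroid_Y = mean(tsne_Y),
              .groups = "drop") %>%
    mutate(model_L1_code = model_lang,
           dist = sqrt((centroid_X - prompt_loc$tsne_X)^2 +
                       (centroid_Y - prompt_loc$tsne_Y)^2))
}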

ggplot(prompt_dists, aes(x = L1_code, y = dist, fill = L1_code)) +
  geom_bar(stat = "identity") +
  xlab("L1 of essay writer") +
  ylab("prompt-essay centroid dist") +
  ggtitle("Distance between essay and prompt centroids") +
  facet_grid(score_bin~model_L1_code)+
  theme_bw()

Low-scoring essays only, with distance on a log scale:

ggplot(filter(prompt_dists, score_bin == "low"),
       aes(x=L1_code, y = log(dist), fill = L1_code)) +
  geom_bar(stat = "identity") +
  xlab("L1 of essay writer") +
  ylab("log prompt-essay centroid dist") +
  ggtitle("Distance between essay and prompt centroids for low essays only") +
  facet_grid(score_bin~model_L1_code) +
  theme_bw() +
  theme(axis.text.x = element_text(angle=90))

I did the same analysis as above, but included essays from writers of all L1s (not just the five above), and the results look sensible: