This analysis looks at the location of an essay in its native semantic space vs. a non-native semantic space. The prediction is that an essay should be closer to the prompt in its native space. We look at German, Hindi, Korean, and Turkish for the “broad academics” prompt (“VC079857”).
Set params.
PROMPT <- "VC079857"
LANGS <- c("ENG", "GER", "HIN", "TUR", "KOR")
Read in model data
all_models <- read_feather("data/all_langs_word_vectors.feather")
all_models_clean <- all_models %>%
select(-type, -translation) %>%
data.table(key = "word")
unique_words <- unique(all_models$word)
Read in essay data
essay_word_counts <- read_feather("data/essay_word_counts_cleaned.feather") %>%
rename(word = word_cleaned) %>%
arrange(essay_id) %>%
filter(word %in% unique_words, # we only care about the top N words
prompt_id == PROMPT) %>%
data.table(key = "word")
unique_essays <- unique(essay_word_counts$essay_id)
Get essay coordinates in semantic space of each language
essay_lang_combos <- expand.grid(unique_essays, LANGS) %>%
rename(essay_id = Var1,
L1_code = Var2) %>%
mutate_all(as.character)
essays_in_all_langs <- map2_df(essay_lang_combos$essay_id,
essay_lang_combos$L1_code,
get_essay_location,
all_models_clean,
essay_word_counts)
Get prompt coordinates in semantic space of each language
prompt_path <- paste0("data/", PROMPT, "_prompt_word_counts.feather")
prompt_counts <- read_feather(prompt_path) %>%
rename(word = word_cleaned,
count = n) %>%
mutate(essay_id = 'prompt') %>%
data.table(.key = "word")
prompts_in_all_langs <- map2_df(rep("prompt", length(LANGS)),
LANGS,
get_essay_location,
all_models_clean,
prompt_counts)
Get TSNE coordinates (separately for each language)
# merge together prompts and essays
prompts_and_essays <- bind_rows(essays_in_all_langs, prompts_in_all_langs)
# Get TSNE coordinates for all essays in each languaage
essays_in_all_langs_2D <- map_df(LANGS, get_tsne_by_lang, prompts_and_essays)
#write_csv(essays_in_all_langs_2D, "../../all_data/tsne_coords_all_essays_cached.csv")
Read in cached data
essays_in_all_langs_2D <- read_csv("../../all_data/tsne_coords_all_essays_cached.csv",
col_types = list(col_double(),
col_double(),
col_character(),
col_character()))
Merge back in metadata to essay and prompt coordinates
metadata_path <- "../../all_data/raw/merged_metadata.csv"
metadata_clean <- read_csv(metadata_path) %>%
mutate_if(is.character, as.factor) %>%
mutate(essay_id = as.character(essay_id))
essay_data <- essays_in_all_langs_2D %>% #essays_in_all_langs_2D_with_meta
rename(model_lang_code = L1_code) %>%
left_join(metadata_clean %>% select(L1_code,
score,
essay_id), by = "essay_id") %>%
mutate(score_bin = ifelse(score > 3, "high", "low"),
score_bin = fct_rev(as.factor(score_bin))) %>% # add score factor
filter(essay_id != "prompt",
L1_code %in% LANGS)
prompt_data <- essays_in_all_langs_2D %>%
filter(essay_id == "prompt") %>%
rename(model_lang_code = L1_code)
In the plots below, each small point is an essay. The larger colored points are language centroids. And, the cross is the prompt centroid.
lang_plot(essay_data, prompt_data, "ENG", essays = T, labels = F)
lang_plot(essay_data, prompt_data, "HIN", essays = T, labels = F)
lang_plot(essay_data, prompt_data, "KOR", essays = T, labels = F)
lang_plot(essay_data, prompt_data, "TUR", essays = T, labels = F)
lang_plot(essay_data, prompt_data, "GER", essays = T, labels = F)
score_plot(essay_data, prompt_data, "ENG")
score_plot(essay_data, prompt_data, "HIN")
score_plot(essay_data, prompt_data, "KOR")
score_plot(essay_data, prompt_data, "TUR")
score_plot(essay_data, prompt_data, "GER")
score_lang_plot(essay_data, prompt_data, "ENG", essays = T, labels = F)
score_lang_plot(essay_data, prompt_data, "HIN", essays = T, labels = F)
score_lang_plot(essay_data, prompt_data, "KOR", essays = T, labels = F)
score_lang_plot(essay_data, prompt_data, "TUR", essays = T, labels = F)
score_lang_plot(essay_data, prompt_data, "GER", essays = T, labels = F)
Column facets correspond to difference language embedding models. Row facets correspond to high vs. low scoring essays. Prediction: Smaller distance when L1_lang == semantic_lang. Holds, except for Turkish.
prompt_dists <- map_df(LANGS, get_prompt_lang_centroid_score_centroid_dist,
essay_data, prompt_data)
ggplot(prompt_dists, aes(x = L1_code, y = dist, fill = L1_code)) +
geom_bar(stat = "identity") +
xlab("L1 of essay writer") +
ylab("prompt-essay centroid dist") +
ggtitle("Distance between essay and prompt centroids") +
facet_grid(score_bin~model_L1_code)+
theme_bw()
Low essays only in log space:
ggplot(filter(prompt_dists, score_bin == "low"),
aes(x=L1_code, y = log(dist), fill = L1_code)) +
geom_bar(stat = "identity") +
xlab("L1 of essay writer") +
ylab("log prompt-essay centroid dist") +
ggtitle("Distance between essay and prompt centroids for low essays only") +
facet_grid(score_bin~model_L1_code) +
theme_bw() +
theme(axis.text.x = element_text(angle=90))
I did the same analysis as above, but included essays written by all L1s, and things look pretty sensible: