Are languages closer to the prompt in their native language model? Unweighted essay locations.
GOOGLE_LANGS <- c("ar", "bn", "bg", "zh", "nl", "fr", "de", "el", "en",
"gu", "hi", "ig", "id", "it", "ja", "kn", "ko",
"ml", "mr", "ne", "pa", "pl", "pt", "ro", "ru",
"es", "tl", "ta", "te", "th", "tr", "ur", "vi", "yo",
"fa")
Read in essay metadata
metadata <- read_csv("../../../data/raw/merged_metadata.csv") %>%
mutate(essay_id = as.character(essay_id)) %>%
mutate(score_bin = ifelse(score < 4, "low", "high"))
Read in essay locations and merge in metadata
essay_path <- "../../../data/processed/translated_essay_vectors/"
files <- list.files(essay_path)[c(-30, -32)]
essays_vectors <- map_df(files, function(x){read_feather(paste0(essay_path,x))}) %>%
data.table()
essays_vectors_unweighted <- essays_vectors[word_weighting == "unweighted"] %>%
merge(metadata %>% select(essay_id, prompt_id, score_bin),
all.x = TRUE)
setcolorder(essays_vectors_unweighted,
c(1:3, 305:306, 4:304))
Read in prompt locations
prompt_centroids <- read_csv("../../../data/processed/prompt_translations/native_prompt_embeddings_study2.csv")
prompt_ids <- unique(prompt_centroids$prompt_id)
Get centroids of each essay_lang x prompt x model_lang
ixs <- expand.grid(prompt_id = prompt_ids,
model_lang = GOOGLE_LANGS[-9], # english is missing
essay_lang = GOOGLE_LANGS,
stringsAsFactors = FALSE)
all_essay_centroids <- pmap_df(list(ixs$prompt_id, ixs$model_lang, ixs$essay_lang),
get_lang_mod_centroids,
essays_vectors_unweighted)
#write_feather(all_essay_centroids, "../../../data/processed/translated_essay_centroids/translated_essay_centroids_all_unweighted.feather")
all_essay_centroids <- read_feather("../../../data/processed/translated_essay_centroids/translated_essay_centroids_all_unweighted.feather") %>%
data.table()
For each essay_lang x prompt x model_lang centroid, get distance from prompt
all_essay_distances_from_prompt <- pmap_df(
list(ixs$prompt_id, ixs$model_lang, ixs$essay_lang),
get_dist_from_prompt,
all_essay_centroids,
prompt_centroids)
#write_feather(all_essay_distances_from_prompt, "../../../data/processed/translated_essay_centroids/translated_prompt_dists_all_unweighted.feather")
all_essay_distances_from_prompt <- read_feather("../../../data/processed/translated_essay_centroids/translated_prompt_dists_all_unweighted.feather")
Plot distance from prompt
dist_means <- all_essay_distances_from_prompt %>%
filter(model_lang != "ja",
model_lang != "th",
model_lang != "zh") %>%
group_by(model_lang, essay_lang) %>%
summarize(mean = mean(cosine_dist),
abs_mean = mean(abs(cosine_dist)))
ggplot(dist_means, aes(x = mean)) +
geom_histogram() +
facet_wrap(~model_lang, scales = "free")
dist_means %>%
group_by(model_lang) %>%
arrange(-mean) %>%
slice(1:2) %>%
as.data.frame() %>%
kable()
model_lang | essay_lang | mean | abs_mean |
---|---|---|---|
ar | ja | 0.8851479 | 0.8851479 |
ar | fa | 0.8844884 | 0.8844884 |
bg | vi | 0.9106663 | 0.9106663 |
bg | es | 0.9106343 | 0.9106343 |
bn | ja | 0.9058845 | 0.9058845 |
bn | ko | 0.9054729 | 0.9054729 |
de | ko | 0.9346251 | 0.9346251 |
de | vi | 0.9338602 | 0.9338602 |
el | fa | 0.9008149 | 0.9008149 |
el | te | 0.9001061 | 0.9001061 |
es | yo | 0.9346893 | 0.9346893 |
es | ig | 0.9346745 | 0.9346745 |
fa | ko | 0.9387155 | 0.9387155 |
fa | th | 0.9384138 | 0.9384138 |
fr | zh | 0.9370297 | 0.9370297 |
fr | ja | 0.9369826 | 0.9369826 |
gu | ja | 0.8907143 | 0.8907143 |
gu | vi | 0.8900985 | 0.8900985 |
hi | ko | 0.9328379 | 0.9328379 |
hi | th | 0.9326936 | 0.9326936 |
id | ja | 0.9441595 | 0.9441595 |
id | ko | 0.9422861 | 0.9422861 |
ig | pl | 0.9885578 | 0.9885578 |
ig | ko | 0.9884597 | 0.9884597 |
it | ko | 0.9254831 | 0.9254831 |
it | vi | 0.9239440 | 0.9239440 |
kn | ko | 0.8464824 | 0.8464824 |
kn | ja | 0.8460231 | 0.8460231 |
ko | ja | 0.8579011 | 0.8579011 |
ko | ko | 0.8574635 | 0.8574635 |
ml | ja | 0.7896742 | 0.7896742 |
ml | vi | 0.7893546 | 0.7893546 |
mr | ja | 0.8544361 | 0.8544361 |
mr | ko | 0.8534093 | 0.8534093 |
ne | ko | 0.8693709 | 0.8693709 |
ne | vi | 0.8675273 | 0.8675273 |
nl | ja | 0.9532651 | 0.9532651 |
nl | vi | 0.9531085 | 0.9531085 |
pa | gu | 0.9426215 | 0.9426215 |
pa | zh | 0.9418346 | 0.9418346 |
pl | zh | 0.9044955 | 0.9044955 |
pl | id | 0.9039826 | 0.9039826 |
pt | ko | 0.9371789 | 0.9371789 |
pt | ig | 0.9367161 | 0.9367161 |
ro | ja | 0.9250079 | 0.9250079 |
ro | zh | 0.9245983 | 0.9245983 |
ru | id | 0.8984596 | 0.8984596 |
ru | vi | 0.8980968 | 0.8980968 |
ta | ko | 0.8425086 | 0.8425086 |
ta | ja | 0.8411394 | 0.8411394 |
te | ko | 0.8750108 | 0.8750108 |
te | ja | 0.8741812 | 0.8741812 |
tl | gu | 0.9397898 | 0.9397898 |
tl | en | 0.9388644 | 0.9388644 |
tr | ja | 0.8617532 | 0.8617532 |
tr | ko | 0.8590297 | 0.8590297 |
ur | ko | 0.9435368 | 0.9435368 |
ur | zh | 0.9434326 | 0.9434326 |
vi | ja | 0.9216203 | 0.9216203 |
vi | ru | 0.9194698 | 0.9194698 |
yo | gu | 0.9762915 | 0.9762915 |
yo | ne | 0.9762407 | 0.9762407 |
t-test
by_class <- dist_means %>%
mutate(group = ifelse(essay_lang == model_lang, "within", "across")) %>%
group_by(model_lang, group) %>%
summarize(mean = mean(mean)) %>%
spread(group, mean) %>%
mutate(diff = across - within)
t.test(by_class$diff, mu = 0)
##
## One Sample t-test
##
## data: by_class$diff
## t = 1.0232, df = 30, p-value = 0.3144
## alternative hypothesis: true mean is not equal to 0
## 95 percent confidence interval:
## -0.0004848394 0.0014585233
## sample estimates:
## mean of x
## 0.000486842
lmer model
all_essay_distances_from_prompt2 <- all_essay_distances_from_prompt %>%
mutate(group = ifelse(essay_lang == model_lang, "within", "across")) %>%
filter(model_lang != "ja",
model_lang != "th",
model_lang != "zh")
mod1 <- lmer(cosine_dist ~ group + (1|essay_lang) + (1|model_lang), all_essay_distances_from_prompt2)
summary(mod1)
## Linear mixed model fit by REML ['lmerMod']
## Formula: cosine_dist ~ group + (1 | essay_lang) + (1 | model_lang)
## Data: all_essay_distances_from_prompt2
##
## REML criterion at convergence: -152589.7
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -6.7632 -0.5429 0.0566 0.6132 3.1232
##
## Random effects:
## Groups Name Variance Std.Dev.
## essay_lang (Intercept) 2.630e-06 0.001622
## model_lang (Intercept) 2.101e-03 0.045837
## Residual 3.812e-04 0.019525
## Number of obs: 30380, groups: essay_lang, 35; model_lang, 31
##
## Fixed effects:
## Estimate Std. Error t value
## (Intercept) 0.9040626 0.0082380 109.7
## groupwithin -0.0002688 0.0006735 -0.4
##
## Correlation of Fixed Effects:
## (Intr)
## groupwithin -0.002
Get centroids of each essay_lang x prompt x model_lang
essays_vectors_unweighted_low <- filter(essays_vectors_unweighted, score_bin == "low") %>%
data.table()
ixs <- expand.grid(prompt_id = prompt_ids,
model_lang = GOOGLE_LANGS[-9], # english is missing
essay_lang = GOOGLE_LANGS,
stringsAsFactors = FALSE)
low_essay_centroids <- pmap_df(list(ixs$prompt_id, ixs$model_lang, ixs$essay_lang),
get_lang_mod_centroids,
essays_vectors_unweighted_low)
#write_feather(low_essay_centroids, "../../../data/processed/translated_essay_centroids/translated_essay_centroids_low_unweighted.feather")
low_essay_centroids <- read_feather("../../../data/processed/translated_essay_centroids/translated_essay_centroids_low_unweighted.feather") %>%
data.table()
For each essay_lang x prompt x model_lang centroid, get distance from prompt
low_essay_distances_from_prompt <- pmap_df(
list(ixs$prompt_id, ixs$model_lang, ixs$essay_lang),
get_dist_from_prompt,
low_essay_centroids,
prompt_centroids)
#write_feather(low_essay_distances_from_prompt, "../../../data/processed/translated_essay_centroids/translated_prompt_dists_low_unweighted.feather")
low_essay_distances_from_prompt <- read_feather("../../../data/processed/translated_essay_centroids/translated_prompt_dists_low_unweighted.feather")
dist_means <- low_essay_distances_from_prompt %>%
filter(model_lang != "ja",
model_lang != "th",
model_lang != "zh") %>%
group_by(model_lang, essay_lang) %>%
summarize(mean = mean(cosine_dist),
abs_mean = mean(abs(cosine_dist)))
ggplot(dist_means, aes(x = mean)) +
geom_histogram() +
facet_wrap(~model_lang, scales = "free")
t-test
by_class <- dist_means %>%
mutate(group = ifelse(essay_lang == model_lang, "within", "across")) %>%
group_by(model_lang, group) %>%
summarize(mean = mean(mean, na.rm = TRUE)) %>%
spread(group, mean) %>%
mutate(diff = across-within)
t.test(by_class$across, by_class$within, paired = TRUE)
##
## Paired t-test
##
## data: by_class$across and by_class$within
## t = 1.1865, df = 25, p-value = 0.2466
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -0.0004547501 0.0016908420
## sample estimates:
## mean of the differences
## 0.0006180459
lmer model
low_essay_distances_from_prompt2 <- low_essay_distances_from_prompt %>%
mutate(group = ifelse(essay_lang == model_lang, "within", "across")) %>%
filter(model_lang != "ja",
model_lang != "th",
model_lang != "zh")
mod2 <- lmer(cosine_dist ~ group + (1|essay_lang) + (1|model_lang) + (1|prompt_id), low_essay_distances_from_prompt2)
summary(mod2)
## Linear mixed model fit by REML ['lmerMod']
## Formula: cosine_dist ~ group + (1 | essay_lang) + (1 | model_lang) + (1 |
## prompt_id)
## Data: low_essay_distances_from_prompt2
##
## REML criterion at convergence: -162164.7
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -7.2938 -0.5246 0.0706 0.5953 3.6269
##
## Random effects:
## Groups Name Variance Std.Dev.
## essay_lang (Intercept) 3.683e-06 0.001919
## model_lang (Intercept) 2.068e-03 0.045473
## prompt_id (Intercept) 1.298e-04 0.011393
## Residual 2.659e-04 0.016306
## Number of obs: 30163, groups:
## essay_lang, 35; model_lang, 31; prompt_id, 28
##
## Fixed effects:
## Estimate Std. Error t value
## (Intercept) 0.9038494 0.0084529 106.9
## groupwithin -0.0002819 0.0005648 -0.5
##
## Correlation of Fixed Effects:
## (Intr)
## groupwithin -0.002