L2ETS Study 2

ALL ESSAYS
LOW ESSAYS

Are languages closer to the prompt in their native language model? Unweighted essay locations.

GOOGLE_LANGS <- c("ar", "bn", "bg", "zh", "nl",  "fr", "de", "el", "en",
                  "gu", "hi", "ig", "id", "it", "ja", "kn", "ko",
                  "ml", "mr", "ne", "pa", "pl", "pt", "ro", "ru", 
                  "es", "tl", "ta", "te", "th", "tr", "ur", "vi", "yo",
                  "fa")

Read in essay metadata

metadata <- read_csv("../../../data/raw/merged_metadata.csv")  %>%
  mutate(essay_id = as.character(essay_id)) %>%
  mutate(score_bin = ifelse(score < 4, "low", "high"))

Read in essay locations and merge in metadata

essay_path <- "../../../data/processed/translated_essay_vectors/"
files <- list.files(essay_path)[c(-30, -32)]

essays_vectors <- map_df(files, function(x){read_feather(paste0(essay_path,x))}) %>%
  data.table()

essays_vectors_unweighted <- essays_vectors[word_weighting == "unweighted"] %>%
  merge(metadata %>% select(essay_id, prompt_id, score_bin),
         all.x = TRUE)

setcolorder(essays_vectors_unweighted, 
            c(1:3, 305:306, 4:304))

Read in prompt locations

prompt_centroids <- read_csv("../../../data/processed/prompt_translations/native_prompt_embeddings_study2.csv")
prompt_ids <- unique(prompt_centroids$prompt_id)

Get centroids of each essay_lang x prompt x model_lang

ixs  <- expand.grid(prompt_id = prompt_ids,
                    model_lang = GOOGLE_LANGS[-9],  # english is missing
                    essay_lang = GOOGLE_LANGS,
                    stringsAsFactors = FALSE) 

all_essay_centroids <- pmap_df(list(ixs$prompt_id, ixs$model_lang, ixs$essay_lang), 
     get_lang_mod_centroids, 
     essays_vectors_unweighted)


#write_feather(all_essay_centroids, "../../../data/processed/translated_essay_centroids/translated_essay_centroids_all_unweighted.feather")

all_essay_centroids <- read_feather("../../../data/processed/translated_essay_centroids/translated_essay_centroids_all_unweighted.feather") %>%
  data.table()

ALL ESSAYS

For each essay_lang x prompt x model_lang centroid, get distance from prompt

all_essay_distances_from_prompt <- pmap_df(
     list(ixs$prompt_id, ixs$model_lang, ixs$essay_lang), 
     get_dist_from_prompt, 
     all_essay_centroids,
     prompt_centroids)

#write_feather(all_essay_distances_from_prompt, "../../../data/processed/translated_essay_centroids/translated_prompt_dists_all_unweighted.feather")

all_essay_distances_from_prompt <- read_feather("../../../data/processed/translated_essay_centroids/translated_prompt_dists_all_unweighted.feather")

Plot distance from prompt

dist_means <- all_essay_distances_from_prompt %>%
    filter(model_lang != "ja",
           model_lang != "th",
           model_lang != "zh") %>%
  group_by(model_lang, essay_lang) %>%
  summarize(mean = mean(cosine_dist),
            abs_mean = mean(abs(cosine_dist)))

ggplot(dist_means, aes(x = mean)) +
  geom_histogram() +
  facet_wrap(~model_lang, scales = "free")

dist_means %>%
  group_by(model_lang) %>%
  arrange(-mean) %>%
  slice(1:2) %>%
  as.data.frame() %>%
  kable()

model_lang	essay_lang	mean	abs_mean
ar	ja	0.8851479	0.8851479
ar	fa	0.8844884	0.8844884
bg	vi	0.9106663	0.9106663
bg	es	0.9106343	0.9106343
bn	ja	0.9058845	0.9058845
bn	ko	0.9054729	0.9054729
de	ko	0.9346251	0.9346251
de	vi	0.9338602	0.9338602
el	fa	0.9008149	0.9008149
el	te	0.9001061	0.9001061
es	yo	0.9346893	0.9346893
es	ig	0.9346745	0.9346745
fa	ko	0.9387155	0.9387155
fa	th	0.9384138	0.9384138
fr	zh	0.9370297	0.9370297
fr	ja	0.9369826	0.9369826
gu	ja	0.8907143	0.8907143
gu	vi	0.8900985	0.8900985
hi	ko	0.9328379	0.9328379
hi	th	0.9326936	0.9326936
id	ja	0.9441595	0.9441595
id	ko	0.9422861	0.9422861
ig	pl	0.9885578	0.9885578
ig	ko	0.9884597	0.9884597
it	ko	0.9254831	0.9254831
it	vi	0.9239440	0.9239440
kn	ko	0.8464824	0.8464824
kn	ja	0.8460231	0.8460231
ko	ja	0.8579011	0.8579011
ko	ko	0.8574635	0.8574635
ml	ja	0.7896742	0.7896742
ml	vi	0.7893546	0.7893546
mr	ja	0.8544361	0.8544361
mr	ko	0.8534093	0.8534093
ne	ko	0.8693709	0.8693709
ne	vi	0.8675273	0.8675273
nl	ja	0.9532651	0.9532651
nl	vi	0.9531085	0.9531085
pa	gu	0.9426215	0.9426215
pa	zh	0.9418346	0.9418346
pl	zh	0.9044955	0.9044955
pl	id	0.9039826	0.9039826
pt	ko	0.9371789	0.9371789
pt	ig	0.9367161	0.9367161
ro	ja	0.9250079	0.9250079
ro	zh	0.9245983	0.9245983
ru	id	0.8984596	0.8984596
ru	vi	0.8980968	0.8980968
ta	ko	0.8425086	0.8425086
ta	ja	0.8411394	0.8411394
te	ko	0.8750108	0.8750108
te	ja	0.8741812	0.8741812
tl	gu	0.9397898	0.9397898
tl	en	0.9388644	0.9388644
tr	ja	0.8617532	0.8617532
tr	ko	0.8590297	0.8590297
ur	ko	0.9435368	0.9435368
ur	zh	0.9434326	0.9434326
vi	ja	0.9216203	0.9216203
vi	ru	0.9194698	0.9194698
yo	gu	0.9762915	0.9762915
yo	ne	0.9762407	0.9762407

t-test

by_class <- dist_means %>%
  mutate(group = ifelse(essay_lang == model_lang, "within", "across")) %>%
  group_by(model_lang, group) %>%
  summarize(mean = mean(mean)) %>%
  spread(group, mean) %>%
  mutate(diff = across - within)

t.test(by_class$diff, mu = 0)

## 
##  One Sample t-test
## 
## data:  by_class$diff
## t = 1.0232, df = 30, p-value = 0.3144
## alternative hypothesis: true mean is not equal to 0
## 95 percent confidence interval:
##  -0.0004848394  0.0014585233
## sample estimates:
##   mean of x 
## 0.000486842

lmer model

all_essay_distances_from_prompt2 <- all_essay_distances_from_prompt %>%
  mutate(group = ifelse(essay_lang == model_lang, "within", "across"))  %>%
  filter(model_lang != "ja",
          model_lang != "th",
          model_lang != "zh") 

mod1 <- lmer(cosine_dist ~ group + (1|essay_lang)  + (1|model_lang), all_essay_distances_from_prompt2)
summary(mod1)

## Linear mixed model fit by REML ['lmerMod']
## Formula: cosine_dist ~ group + (1 | essay_lang) + (1 | model_lang)
##    Data: all_essay_distances_from_prompt2
## 
## REML criterion at convergence: -152589.7
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -6.7632 -0.5429  0.0566  0.6132  3.1232 
## 
## Random effects:
##  Groups     Name        Variance  Std.Dev.
##  essay_lang (Intercept) 2.630e-06 0.001622
##  model_lang (Intercept) 2.101e-03 0.045837
##  Residual               3.812e-04 0.019525
## Number of obs: 30380, groups:  essay_lang, 35; model_lang, 31
## 
## Fixed effects:
##               Estimate Std. Error t value
## (Intercept)  0.9040626  0.0082380   109.7
## groupwithin -0.0002688  0.0006735    -0.4
## 
## Correlation of Fixed Effects:
##             (Intr)
## groupwithin -0.002

LOW ESSAYS