Are languages closer to the prompt in their native language model? Unweighted essay locations.

GOOGLE_LANGS <- c("ar", "bn", "bg", "zh", "nl",  "fr", "de", "el", "en",
                  "gu", "hi", "ig", "id", "it", "ja", "kn", "ko",
                  "ml", "mr", "ne", "pa", "pl", "pt", "ro", "ru", 
                  "es", "tl", "ta", "te", "th", "tr", "ur", "vi", "yo",
                  "fa") 

Read in essay metadata

metadata <- read_csv("../../../data/raw/merged_metadata.csv")  %>%
  mutate(essay_id = as.character(essay_id)) %>%
  mutate(score_bin = ifelse(score < 4, "low", "high"))

Read in essay locations and merge in metadata

essay_path <- "../../../data/processed/translated_essay_vectors/"
files <- list.files(essay_path)[c(-30, -32)]

essays_vectors <- map_df(files, function(x){read_feather(paste0(essay_path,x))}) %>%
  data.table()

essays_vectors_unweighted <- essays_vectors[word_weighting == "unweighted"] %>%
  merge(metadata %>% select(essay_id, prompt_id, score_bin),
         all.x = TRUE)

setcolorder(essays_vectors_unweighted, 
            c(1:3, 305:306, 4:304))

Read in prompt locations

prompt_centroids <- read_csv("../../../data/processed/prompt_translations/native_prompt_embeddings_study2.csv")
prompt_ids <- unique(prompt_centroids$prompt_id)

Get centroids of each essay_lang x prompt x model_lang

ixs  <- expand.grid(prompt_id = prompt_ids,
                    model_lang = GOOGLE_LANGS[-9],  # english is missing
                    essay_lang = GOOGLE_LANGS,
                    stringsAsFactors = FALSE) 

all_essay_centroids <- pmap_df(list(ixs$prompt_id, ixs$model_lang, ixs$essay_lang), 
     get_lang_mod_centroids, 
     essays_vectors_unweighted)


#write_feather(all_essay_centroids, "../../../data/processed/translated_essay_centroids/translated_essay_centroids_all_unweighted.feather")
all_essay_centroids <- read_feather("../../../data/processed/translated_essay_centroids/translated_essay_centroids_all_unweighted.feather") %>%
  data.table()

ALL ESSAYS

For each essay_lang x prompt x model_lang centroid, get distance from prompt

all_essay_distances_from_prompt <- pmap_df(
     list(ixs$prompt_id, ixs$model_lang, ixs$essay_lang), 
     get_dist_from_prompt, 
     all_essay_centroids,
     prompt_centroids)

#write_feather(all_essay_distances_from_prompt, "../../../data/processed/translated_essay_centroids/translated_prompt_dists_all_unweighted.feather")
all_essay_distances_from_prompt <- read_feather("../../../data/processed/translated_essay_centroids/translated_prompt_dists_all_unweighted.feather") 

Plot distance from prompt

dist_means <- all_essay_distances_from_prompt %>%
    filter(model_lang != "ja",
           model_lang != "th",
           model_lang != "zh") %>%
  group_by(model_lang, essay_lang) %>%
  summarize(mean = mean(cosine_dist),
            abs_mean = mean(abs(cosine_dist)))

ggplot(dist_means, aes(x = mean)) +
  geom_histogram() +
  facet_wrap(~model_lang, scales = "free")

dist_means %>%
  group_by(model_lang) %>%
  arrange(-mean) %>%
  slice(1:2) %>%
  as.data.frame() %>%
  kable()
model_lang essay_lang mean abs_mean
ar ja 0.8851479 0.8851479
ar fa 0.8844884 0.8844884
bg vi 0.9106663 0.9106663
bg es 0.9106343 0.9106343
bn ja 0.9058845 0.9058845
bn ko 0.9054729 0.9054729
de ko 0.9346251 0.9346251
de vi 0.9338602 0.9338602
el fa 0.9008149 0.9008149
el te 0.9001061 0.9001061
es yo 0.9346893 0.9346893
es ig 0.9346745 0.9346745
fa ko 0.9387155 0.9387155
fa th 0.9384138 0.9384138
fr zh 0.9370297 0.9370297
fr ja 0.9369826 0.9369826
gu ja 0.8907143 0.8907143
gu vi 0.8900985 0.8900985
hi ko 0.9328379 0.9328379
hi th 0.9326936 0.9326936
id ja 0.9441595 0.9441595
id ko 0.9422861 0.9422861
ig pl 0.9885578 0.9885578
ig ko 0.9884597 0.9884597
it ko 0.9254831 0.9254831
it vi 0.9239440 0.9239440
kn ko 0.8464824 0.8464824
kn ja 0.8460231 0.8460231
ko ja 0.8579011 0.8579011
ko ko 0.8574635 0.8574635
ml ja 0.7896742 0.7896742
ml vi 0.7893546 0.7893546
mr ja 0.8544361 0.8544361
mr ko 0.8534093 0.8534093
ne ko 0.8693709 0.8693709
ne vi 0.8675273 0.8675273
nl ja 0.9532651 0.9532651
nl vi 0.9531085 0.9531085
pa gu 0.9426215 0.9426215
pa zh 0.9418346 0.9418346
pl zh 0.9044955 0.9044955
pl id 0.9039826 0.9039826
pt ko 0.9371789 0.9371789
pt ig 0.9367161 0.9367161
ro ja 0.9250079 0.9250079
ro zh 0.9245983 0.9245983
ru id 0.8984596 0.8984596
ru vi 0.8980968 0.8980968
ta ko 0.8425086 0.8425086
ta ja 0.8411394 0.8411394
te ko 0.8750108 0.8750108
te ja 0.8741812 0.8741812
tl gu 0.9397898 0.9397898
tl en 0.9388644 0.9388644
tr ja 0.8617532 0.8617532
tr ko 0.8590297 0.8590297
ur ko 0.9435368 0.9435368
ur zh 0.9434326 0.9434326
vi ja 0.9216203 0.9216203
vi ru 0.9194698 0.9194698
yo gu 0.9762915 0.9762915
yo ne 0.9762407 0.9762407

t-test

by_class <- dist_means %>%
  mutate(group = ifelse(essay_lang == model_lang, "within", "across")) %>%
  group_by(model_lang, group) %>%
  summarize(mean = mean(mean)) %>%
  spread(group, mean) %>%
  mutate(diff = across - within)

t.test(by_class$diff, mu = 0)
## 
##  One Sample t-test
## 
## data:  by_class$diff
## t = 1.0232, df = 30, p-value = 0.3144
## alternative hypothesis: true mean is not equal to 0
## 95 percent confidence interval:
##  -0.0004848394  0.0014585233
## sample estimates:
##   mean of x 
## 0.000486842

lmer model

all_essay_distances_from_prompt2 <- all_essay_distances_from_prompt %>%
  mutate(group = ifelse(essay_lang == model_lang, "within", "across"))  %>%
  filter(model_lang != "ja",
          model_lang != "th",
          model_lang != "zh") 

mod1 <- lmer(cosine_dist ~ group + (1|essay_lang)  + (1|model_lang), all_essay_distances_from_prompt2)
summary(mod1)
## Linear mixed model fit by REML ['lmerMod']
## Formula: cosine_dist ~ group + (1 | essay_lang) + (1 | model_lang)
##    Data: all_essay_distances_from_prompt2
## 
## REML criterion at convergence: -152589.7
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -6.7632 -0.5429  0.0566  0.6132  3.1232 
## 
## Random effects:
##  Groups     Name        Variance  Std.Dev.
##  essay_lang (Intercept) 2.630e-06 0.001622
##  model_lang (Intercept) 2.101e-03 0.045837
##  Residual               3.812e-04 0.019525
## Number of obs: 30380, groups:  essay_lang, 35; model_lang, 31
## 
## Fixed effects:
##               Estimate Std. Error t value
## (Intercept)  0.9040626  0.0082380   109.7
## groupwithin -0.0002688  0.0006735    -0.4
## 
## Correlation of Fixed Effects:
##             (Intr)
## groupwithin -0.002

LOW ESSAYS

Get centroids of each essay_lang x prompt x model_lang

essays_vectors_unweighted_low <- filter(essays_vectors_unweighted, score_bin == "low") %>%
  data.table()

ixs  <- expand.grid(prompt_id = prompt_ids,
                    model_lang = GOOGLE_LANGS[-9],  # english is missing
                    essay_lang = GOOGLE_LANGS,
                    stringsAsFactors = FALSE) 

low_essay_centroids <- pmap_df(list(ixs$prompt_id, ixs$model_lang, ixs$essay_lang), 
     get_lang_mod_centroids, 
     essays_vectors_unweighted_low)

#write_feather(low_essay_centroids, "../../../data/processed/translated_essay_centroids/translated_essay_centroids_low_unweighted.feather")
low_essay_centroids <- read_feather("../../../data/processed/translated_essay_centroids/translated_essay_centroids_low_unweighted.feather") %>%
  data.table()

For each essay_lang x prompt x model_lang centroid, get distance from prompt

low_essay_distances_from_prompt <- pmap_df(
     list(ixs$prompt_id, ixs$model_lang, ixs$essay_lang), 
     get_dist_from_prompt, 
     low_essay_centroids,
     prompt_centroids)

#write_feather(low_essay_distances_from_prompt, "../../../data/processed/translated_essay_centroids/translated_prompt_dists_low_unweighted.feather")
low_essay_distances_from_prompt <- read_feather("../../../data/processed/translated_essay_centroids/translated_prompt_dists_low_unweighted.feather") 

dist_means <- low_essay_distances_from_prompt %>%
   filter(model_lang != "ja",
          model_lang != "th",
          model_lang != "zh")  %>%
  group_by(model_lang, essay_lang) %>%
  summarize(mean = mean(cosine_dist),
            abs_mean = mean(abs(cosine_dist)))

ggplot(dist_means, aes(x = mean)) +
  geom_histogram() +
  facet_wrap(~model_lang, scales = "free")

t-test

by_class <- dist_means %>%
  mutate(group = ifelse(essay_lang == model_lang, "within", "across")) %>%
  group_by(model_lang, group) %>%
  summarize(mean = mean(mean, na.rm = TRUE)) %>%
  spread(group, mean) %>%
  mutate(diff = across-within)

t.test(by_class$across, by_class$within, paired = TRUE)
## 
##  Paired t-test
## 
## data:  by_class$across and by_class$within
## t = 1.1865, df = 25, p-value = 0.2466
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -0.0004547501  0.0016908420
## sample estimates:
## mean of the differences 
##            0.0006180459

lmer model

low_essay_distances_from_prompt2 <- low_essay_distances_from_prompt %>%
  mutate(group = ifelse(essay_lang == model_lang, "within", "across"))  %>%
     filter(model_lang != "ja",
            model_lang != "th",
            model_lang != "zh")  

mod2 <- lmer(cosine_dist ~ group + (1|essay_lang) + (1|model_lang) + (1|prompt_id), low_essay_distances_from_prompt2)
summary(mod2)
## Linear mixed model fit by REML ['lmerMod']
## Formula: cosine_dist ~ group + (1 | essay_lang) + (1 | model_lang) + (1 |  
##     prompt_id)
##    Data: low_essay_distances_from_prompt2
## 
## REML criterion at convergence: -162164.7
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -7.2938 -0.5246  0.0706  0.5953  3.6269 
## 
## Random effects:
##  Groups     Name        Variance  Std.Dev.
##  essay_lang (Intercept) 3.683e-06 0.001919
##  model_lang (Intercept) 2.068e-03 0.045473
##  prompt_id  (Intercept) 1.298e-04 0.011393
##  Residual               2.659e-04 0.016306
## Number of obs: 30163, groups:  
## essay_lang, 35; model_lang, 31; prompt_id, 28
## 
## Fixed effects:
##               Estimate Std. Error t value
## (Intercept)  0.9038494  0.0084529   106.9
## groupwithin -0.0002819  0.0005648    -0.5
## 
## Correlation of Fixed Effects:
##             (Intr)
## groupwithin -0.002