Here we look at the difference in the distributions of low- and high-scoring essays along two measures: mean and variance. We do two versions of this analysis: (A) pooling across all languages, and (B) by language. For each, we downsample so that there is the same number of essays in each language x prompt x score bin.

SUMMARY: Lower-scoring essays are closer to the mean (both pooling across languages and by language). Lower-scoring essays also have more variance.


Parameters

CUTOFF <- 4
N_PER_L_SCORE_PROMPT <- 10
MIN_SCORE <- 2

1 Data

Read in doctag indices, docvecs, and metadata

doctag_indices <- read_feather("../../../data/processed/models/all_model/doctag_indices_all_model.feather") 
docvecs <- read_feather("../../../data/processed/models/all_model/docvecs_all_model.feather") %>%
  as.data.frame() %>%
  mutate(offset = 0:(n()-1)) %>%
  select(offset, everything())

metadata <- read_csv("../../../data/raw/merged_metadata.csv") 
metadata_clean <- metadata %>%
  mutate_if(is.character, as.factor)  %>%
  mutate(essay_id = as.character(essay_id))
n_scores_not_na <- metadata_clean %>% 
  filter(!is.na(score)) %>% 
  nrow()

We have scores for 100% of essays. Note that, unlike Study 1, we only have whole-number scores (5 values).
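
The 100% figure comes from comparing n_scores_not_na with the total number of essays; a quick check (sketch):

round(100 * n_scores_not_na / nrow(metadata_clean), 1)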

Merge all data sources together

d <- doctag_indices %>%
  left_join(metadata_clean) %>%
  left_join(docvecs) %>%
  nest(-1:-10, .key = "doc_vector") %>% # pre-tidyr-1.0 nest(): keep the first 10 (metadata) columns, nest the docvec dims
  select(-doc_count, -offset, -test_center_country_code)

Get TSNE coordinates. These are the same for both the all-languages and by-language analyses.

#mats <- d %>%
#  select(doc_vector) %>%
#  unnest() %>%
#  as.matrix()

#tsne_out = Rtsne::Rtsne(mats)
#tsne_dims <- tsne_out$Y %>%
#  as.data.frame() %>%
#  rename(tsne_X = V1,
#         tsne_Y = V2) %>%
#  bind_cols(d %>% select(essay_id, score, age, gender, L1_code, prompt_id)) %>%
#  select(everything(), tsne_X, tsne_Y)
# write_csv(tsne_dims, "../all_data/tsne_dims_cached.csv")
tsne_dims <- read_csv("../../../data/processed/tsne/tsne_dims_cached_all.csv")

2 Score distributions

tsne_dims %>%
  ggplot(aes(x = score)) +
  geom_histogram(binwidth = 1) +
  geom_vline(aes(xintercept = mean(tsne_dims$score)), color = "red") +
  ggtitle("overall score distribution") +
  theme_bw()

tsne_dims %>%
  ggplot(aes(x = score, fill = L1_code)) +
  geom_histogram(binwidth = 1) +
  facet_wrap(~L1_code) +
  #geom_density() +
  ggtitle("Score distribution by language") +
  theme_bw() +
  theme(legend.position = "none")

tsne_dims %>%
    count(score, L1_code, prompt_id) %>%
    ggplot(aes(x = n)) +
    geom_histogram(binwidth = 1) +
    ggtitle("Number of essays per language, score, prompt bin, including score = 1") +
    theme_bw() 

tsne_dims %>%
    filter(score >= MIN_SCORE) %>%
    count(score, L1_code, prompt_id) %>%
    ggplot(aes(x = n)) +
    geom_histogram(binwidth = 1) +
    ggtitle("Number of essays per language, score, prompt bin, excluding score = 1") +
    theme_bw() 

MEAN_N_PER_L_SCORE_PROMPT <- tsne_dims %>%
    filter(score >= MIN_SCORE) %>%
    count(score, L1_code, prompt_id) %>%
    summarize(n = mean(n))

3 Downsample

Scores are unevenly distributed across languages, so we need to downsample. Here we use 10 essays per language x score bin x prompt (mean bin size = 10.13).

Downsample essays.

tsne_dims_downsampled <- tsne_dims %>%
  nest(1:2, .key = "doc_vector") %>% # nest the tSNE coordinates into a doc_vector list column
  filter(score >= MIN_SCORE) %>%
  group_by(L1_code, score, prompt_id) %>%
  sample_n(N_PER_L_SCORE_PROMPT, replace = T) %>% # sample with replacement, since some bins have fewer than N essays
  mutate(score_bin = ifelse(score < CUTOFF, "low", "high"),
         score_prompt = paste(score_bin, prompt_id, sep = "_"),
         L1_score_prompt = paste(L1_code, score_bin, prompt_id, sep = "_"),
         L1_prompt = paste(L1_code, prompt_id, sep = "_")) %>%
  ungroup()

The bins:

tsne_dims_downsampled %>%
  group_by(score_bin) %>%
  distinct(score)
## # A tibble: 4 x 2
## # Groups:   score_bin [2]
##   score score_bin
##   <int>     <chr>
## 1     2       low
## 2     3       low
## 3     4      high
## 4     5      high

4 All Languages

4.1 Mean

Centroid = grand center. The grand centroid is computed hierarchically: language x score bin x prompt centroids are averaged up to language x prompt, then to prompt, then to a single grand center.

# Compute the centroid (column means of the nested doc vectors) for one group,
# returned as a one-row data frame labeled with that group's name.
get_group_centroid <- function(doc_vecs_df, group_name){
  centroid <- doc_vecs_df %>%
    select(doc_vector) %>%
    unnest(doc_vector) %>%
    colMeans() 

  # the group label is constant within the group, so take it from the first row
  this_group_name  <- doc_vecs_df %>%
    select_(group_name) %>% # select_() is deprecated in current dplyr, but works here
    slice(1) %>%
    unlist()

  df <- data.frame(t(centroid)) %>%
    mutate(x = this_group_name) %>%
    select(x, everything())
  
   names(df)[1] = c(as.name(group_name))
   df
}

# L1_score_prompt centers
tsne_centroids_BL_score_prompt <- tsne_dims_downsampled %>%
  split(.$L1_score_prompt) %>%
  map_df(get_group_centroid, "L1_score_prompt") %>%
  separate(L1_score_prompt, c("L1_code", "score_bin", "prompt_id"), "_") %>%
  nest(4:5, .key = "doc_vector")
grand_centroids <- tsne_centroids_BL_score_prompt %>% # language x score bin x prompt centroids
  mutate(L1_prompt = paste0(L1_code, "_", prompt_id )) %>%
  split(.$L1_prompt) %>%
  map_df(get_group_centroid, "L1_prompt") %>% # average over score bins: language x prompt
  separate(L1_prompt, c("L1_code", "prompt_id"), "_")  %>%
  group_by(prompt_id) %>%
  summarize(tsne_X = mean(tsne_X), # average over languages: prompt
            tsne_Y = mean(tsne_Y)) %>%
  mutate(group = "all") %>%
  group_by(group) %>%
  summarize(center_tsne_X = mean(tsne_X), # average over prompts: single grand center
            center_tsne_Y = mean(tsne_Y))

group_centroids <- tsne_centroids_BL_score_prompt %>% # language x score bin x prompt centroids
  mutate(prompt_score = paste0(prompt_id, "_" , score_bin)) %>%
  split(.$prompt_score) %>%
  map_df(get_group_centroid, "prompt_score")  %>% # average over languages: score bin x prompt
  mutate(group = "all") %>%
  separate(prompt_score, c("prompt_id", "score_bin"), "_") 
  
distance_measures <- group_centroids %>%
  left_join(grand_centroids) %>%
  rowwise() %>%
  mutate(dist_from_grand_mean = 
           dist(rbind(c(center_tsne_X, center_tsne_Y),
                      c(tsne_X, tsne_Y)), method = "euclidean")[1])
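
The rowwise() dist() call above is just the 2D Euclidean distance, sqrt((x1 - x2)^2 + (y1 - y2)^2). A vectorized sanity check (a sketch; distance_check and dist_vectorized are illustrative names):

# sketch: compute the same Euclidean distance without rowwise()
distance_check <- group_centroids %>%
  left_join(grand_centroids) %>%
  mutate(dist_vectorized = sqrt((center_tsne_X - tsne_X)^2 +
                                (center_tsne_Y - tsne_Y)^2))

all.equal(distance_check$dist_vectorized, distance_measures$dist_from_grand_mean)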

bin_dists <- distance_measures %>%
  group_by(score_bin) %>%
  multi_boot_standard(col = "dist_from_grand_mean")

kable(bin_dists)
score_bin ci_lower ci_upper mean
high 14.9029 20.63314 17.76401
low 13.6166 19.07722 16.35866
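
multi_boot_standard() comes from the langcog package (assumed to be loaded in the setup chunk) and returns the column mean with a nonparametric bootstrap 95% CI. A rough stand-in, as a sketch with a hypothetical boot_ci() helper:

# sketch of what multi_boot_standard() is assumed to compute:
# group mean plus a percentile bootstrap 95% CI
boot_ci <- function(x, n_boot = 1000) {
  boots <- replicate(n_boot, mean(sample(x, replace = TRUE)))
  data.frame(ci_lower = quantile(boots, .025),
             ci_upper = quantile(boots, .975),
             mean = mean(x))
}

distance_measures %>%
  group_by(score_bin) %>%
  do(boot_ci(.$dist_from_grand_mean))
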
ggplot(bin_dists, aes(x = score_bin, 
                      y = mean, 
                      fill = score_bin)) +
  geom_bar(stat = "identity") +
  geom_linerange(aes(ymin = ci_lower, ymax = ci_upper)) +
  ggtitle("Distance from grand centroid (bootstrapping across prompts)") +
  ylab("distance from grand centroid") +
  theme_minimal()

Model

model_data <- tsne_dims_downsampled %>%
  mutate(group = "all") %>%
  left_join(grand_centroids) %>%
  unnest() %>%
  rowwise() %>%
  mutate(dist_from_grand_mean = 
           dist(rbind(c(center_tsne_X, center_tsne_Y),
                      c(tsne_X, tsne_Y)), method = "euclidean")[1])

# these don't converge
#model <- lmer(dist_from_grand_mean ~  score_bin + (score_bin|prompt_id) + (prompt_id|L1_code) +  (score_bin|L1_code), data= model_data,  control=lmerControl(optimizer = "bobyqa"))

#model <- lmer(dist_from_grand_mean ~  score_bin + (score_bin|prompt_id) + (prompt_id|L1_code) , data= model_data,  control=lmerControl(optimizer = "bobyqa"))

#model <- lmer(dist_from_grand_mean ~  score_bin +  (prompt_id|L1_code) , data= model_data,  control=lmerControl(optimizer = "bobyqa"))

model <- lmer(dist_from_grand_mean ~  score_bin +  (1|L1_code) + (1|prompt_id) , data= model_data,  control=lmerControl(optimizer = "bobyqa"))
summary(model)
## Linear mixed model fit by REML ['lmerMod']
## Formula: 
## dist_from_grand_mean ~ score_bin + (1 | L1_code) + (1 | prompt_id)
##    Data: model_data
## Control: lmerControl(optimizer = "bobyqa")
## 
## REML criterion at convergence: 196041.3
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -8.1975 -0.3182  0.0811  0.4600  7.2130 
## 
## Random effects:
##  Groups    Name        Variance Std.Dev.
##  L1_code   (Intercept)  0.1221  0.3494  
##  prompt_id (Intercept) 52.4049  7.2391  
##  Residual              10.9067  3.3025  
## Number of obs: 37440, groups:  L1_code, 35; prompt_id, 28
## 
## Fixed effects:
##              Estimate Std. Error t value
## (Intercept)  18.28218    1.36955   13.35
## score_binlow -1.25799    0.03418  -36.81
## 
## Correlation of Fixed Effects:
##             (Intr)
## score_binlw -0.012

Low-scoring essays are closer to the grand centroid than high-scoring essays (the score_binlow coefficient is negative).

4.2 Variance

Here are scatter plots of the downsampled essays for each prompt, colored by score bin.

ggplot(tsne_dims_downsampled %>% unnest(), 
       aes(color = score_bin, x = tsne_X, y = tsne_Y)) +
  facet_wrap(~prompt_id, nrow = 4) + 
  geom_point(size = .2, alpha = .3) +
  theme_minimal()

Here, diameter is defined as the average distance from each point to all other points in the same group:
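
get_group_diameter() is defined in a shared helper script that isn't shown here; its signature is inferred from the calls below. A minimal sketch of what it is assumed to compute (the mean pairwise distance among a group's doc vectors, labeled with the group name; only the "mean_dist" variant used in this analysis is sketched):

# sketch of the assumed helper (the real definition lives in a helpers file):
# mean pairwise Euclidean distance among the doc vectors within one group
get_group_diameter <- function(doc_vecs_df, group_name, method = "mean_dist") {
  coords <- doc_vecs_df %>%
    select(doc_vector) %>%
    unnest(doc_vector) %>%
    as.matrix()

  pairwise <- dist(coords, method = "euclidean") # all pairwise distances

  out <- data.frame(diameter = mean(pairwise))
  out[[group_name]] <- doc_vecs_df[[group_name]][1]
  out[, c(group_name, "diameter")]
}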

tsne_diameters_LA_mean <- tsne_dims_downsampled %>%
  split(.$score_prompt) %>% 
  map_df(get_group_diameter, "score_prompt", "mean_dist")  %>%
  separate(score_prompt, c("score_bin","prompt_id"), sep = "_")

tsne_diameters_LA_mean %>%
  group_by(score_bin) %>%
  multi_boot_standard(col = "diameter") %>%
  ggplot(aes(x = score_bin, 
                      y = mean, 
                      fill = score_bin)) +
  geom_bar(stat = "identity") +
  geom_linerange(aes(ymin = ci_lower, ymax = ci_upper)) +
  ggtitle("Diameter, averaging across prompts") +
  ylab("mean diameter (average dist to all within-group points)") +
  theme_minimal() 

With this measure, low scoring essays have more variance.

# rows in `low` and `high` are aligned by prompt_id (both come from the same
# table, which is ordered by score_prompt), so the paired test pairs prompts correctly
low = filter(tsne_diameters_LA_mean, score_bin == "low")
high = filter(tsne_diameters_LA_mean, score_bin == "high")

t.test(low$diameter, high$diameter, paired = TRUE)
## 
##  Paired t-test
## 
## data:  low$diameter and high$diameter
## t = 4.2199, df = 27, p-value = 0.0002468
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  0.4745154 1.3726586
## sample estimates:
## mean of the differences 
##                0.923587

5 By Language

5.1 Mean

By language: here the reference point is each language's own centroid rather than the grand centroid.

grand_centroids <- tsne_centroids_BL_score_prompt %>% # SPL
  mutate(L1_prompt = paste0(L1_code, "_", prompt_id )) %>%
  split(.$L1_prompt) %>%
  map_df(get_group_centroid, "L1_prompt") %>% # PL
  separate(L1_prompt, c("L1_code",  "prompt_id"), "_") %>%
  nest(3:4, .key = "doc_vector") %>%
  split(.$L1_code) %>%
  map_df(get_group_centroid, "L1_code")  %>%
  rename(center_tsne_X = tsne_X,
        center_tsne_Y = tsne_Y)

distance_measures <- tsne_centroids_BL_score_prompt %>%
  unnest() %>%
  left_join(grand_centroids) %>%
  rowwise () %>%
  mutate(dist_from_grand_mean = 
           dist(rbind(c(center_tsne_X, center_tsne_Y),
                      c(tsne_X, tsne_Y)), method = "euclidean")[1])

bin_dists <- distance_measures %>%
  group_by(L1_code, score_bin) %>%
  multi_boot_standard(col = "dist_from_grand_mean")

ggplot(bin_dists, aes(x = L1_code, y = mean, group = score_bin, fill = score_bin)) +
  geom_bar(stat = "identity", position = position_dodge() ) +
  xlab("Native language") +
  geom_linerange(aes(ymin = ci_lower, ymax = ci_upper), position = position_dodge(.9)) +
  ylab("Euclidean distance from language centroid") +
  theme_minimal() +
  theme(text = element_text(size = 15),
          axis.text.x = element_text(angle = 90, hjust = 1)) +
  scale_fill_brewer(palette = "Set1", name = "Score") 

kable(bin_dists)
L1_code score_bin ci_lower ci_upper mean
ARA high 14.50854 20.31290 17.35065
ARA low 13.42778 18.60312 15.92734
BEN high 16.05779 21.90380 18.89651
BEN low 13.78415 19.82368 16.86073
BUL high 14.24274 19.81432 17.21946
BUL low 13.05452 19.16474 16.12481
CHI high 15.38462 21.06080 18.21525
CHI low 13.39906 18.50594 16.04284
DUT high 14.96819 20.59136 17.85774
DUT low 12.82454 18.80713 15.88590
ENG high 15.35845 21.17618 18.16705
ENG low 14.13516 19.40165 16.76981
FAS high 14.14614 20.07018 17.29799
FAS low 12.82309 18.41262 15.52350
FRE high 14.61921 20.40549 17.65681
FRE low 13.23382 18.72757 15.97212
GER high 14.97270 20.68973 17.82031
GER low 13.62390 19.17379 16.39672
GRE high 14.45674 20.36418 17.56723
GRE low 12.71918 18.38236 15.73442
GUJ high 14.36836 20.57892 17.63308
GUJ low 14.19541 19.59639 16.82177
HIN high 15.41619 21.09405 18.33358
HIN low 14.59187 20.14962 17.35096
IBO high 15.07161 21.07239 18.06111
IBO low 13.64845 18.74564 16.27427
IND high 13.95436 20.12811 17.13321
IND low 13.07838 18.61289 15.93580
ITA high 14.40358 20.30740 17.37258
ITA low 13.40478 18.74980 16.14432
JPN high 14.33399 20.21688 17.25782
JPN low 13.47457 19.01434 16.17036
KAN high 15.60864 21.75202 18.74677
KAN low 13.54344 19.61928 16.61888
KOR high 15.36967 20.57560 17.91790
KOR low 13.56424 18.81132 16.32295
MAL high 14.89661 20.97647 18.02225
MAL low 14.38193 20.32834 17.22895
MAR high 15.17457 21.11376 18.06121
MAR low 14.38692 20.03161 17.23579
NEP high 15.03110 21.51021 18.32608
NEP low 14.85312 20.22407 17.50173
PAN high 14.79346 20.67256 17.98117
PAN low 14.38446 19.61306 17.18925
POL high 14.55938 20.46271 17.58511
POL low 13.63650 18.82258 16.33064
POR high 14.66998 20.24575 17.65120
POR low 13.03345 18.45268 15.69171
RUM high 14.69363 20.56084 17.67140
RUM low 12.95566 18.20951 15.76591
RUS high 15.07623 20.55559 17.84764
RUS low 14.01565 19.44334 16.70459
SPA high 14.78474 20.35636 17.62049
SPA low 12.85185 18.51199 15.65550
TAM high 15.09655 21.02593 18.22260
TAM low 14.46361 19.94942 17.12117
TEL high 15.01908 20.73831 17.98919
TEL low 14.13862 19.65750 16.88112
TGL high 14.65664 20.06857 17.51809
TGL low 14.23585 19.65155 16.85779
THA high 15.28873 20.75167 17.81894
THA low 13.81847 19.33315 16.60962
TUR high 14.37129 20.26415 17.40229
TUR low 14.12079 19.27540 16.55622
URD high 15.20931 21.26715 18.21548
URD low 14.28098 19.57639 17.00679
VIE high 14.82609 20.32882 17.67133
VIE low 12.98558 18.37910 15.69448
YOR high 15.40881 21.29999 18.31916
YOR low 13.54515 19.39071 16.53499

Model

model_data <- tsne_dims_downsampled %>%
  left_join(grand_centroids) %>%
  unnest() %>%
  rowwise() %>%
  mutate(dist_from_grand_mean = 
           dist(rbind(c(center_tsne_X, center_tsne_Y),
                      c(tsne_X, tsne_Y)), method = "euclidean")[1])
model_data %>%
  group_by(L1_code) %>%
  do(tidy(lmer(dist_from_grand_mean ~  score_bin + (score_bin|prompt_id), data=.,control=lmerControl(optimizer = "bobyqa")))) %>%
  filter(term == "score_binlow") %>%
  arrange(statistic) %>%
  kable()
L1_code term estimate std.error statistic group
BEN score_binlow -2.0482763 0.3133251 -6.5372246 fixed
JPN score_binlow -1.3008434 0.2276088 -5.7152607 fixed
POR score_binlow -1.9791854 0.3535199 -5.5985123 fixed
KOR score_binlow -1.1535076 0.2146429 -5.3740769 fixed
FRE score_binlow -1.0834534 0.2105391 -5.1460904 fixed
SPA score_binlow -1.3773475 0.2989218 -4.6077185 fixed
RUM score_binlow -1.8094388 0.4096531 -4.4170021 fixed
ENG score_binlow -1.8065191 0.4389514 -4.1155333 fixed
DUT score_binlow -1.3683001 0.3400837 -4.0234216 fixed
CHI score_binlow -1.6130749 0.4252086 -3.7936086 fixed
VIE score_binlow -1.5968721 0.4245418 -3.7614013 fixed
YOR score_binlow -1.7928780 0.4818305 -3.7209723 fixed
BUL score_binlow -1.4105576 0.3802423 -3.7096282 fixed
TAM score_binlow -1.3301323 0.3595957 -3.6989661 fixed
URD score_binlow -1.1841224 0.3247621 -3.6461227 fixed
GER score_binlow -1.2124774 0.3427081 -3.5379300 fixed
ARA score_binlow -1.4748536 0.4258047 -3.4636853 fixed
POL score_binlow -1.1744431 0.3450250 -3.4039360 fixed
GRE score_binlow -1.6208330 0.4967683 -3.2627547 fixed
TEL score_binlow -1.1784583 0.3646043 -3.2321566 fixed
FAS score_binlow -1.4194862 0.4558191 -3.1141435 fixed
IBO score_binlow -1.3715324 0.4635671 -2.9586492 fixed
THA score_binlow -1.0404292 0.3799435 -2.7383789 fixed
KAN score_binlow -1.5238304 0.5570942 -2.7353189 fixed
HIN score_binlow -0.9098248 0.3566241 -2.5512152 fixed
TUR score_binlow -0.7708002 0.3064908 -2.5149213 fixed
ITA score_binlow -0.9183046 0.3806489 -2.4124713 fixed
TGL score_binlow -0.6183860 0.2580783 -2.3961182 fixed
NEP score_binlow -0.8070802 0.3420564 -2.3594948 fixed
GUJ score_binlow -1.0060198 0.4411161 -2.2806237 fixed
MAL score_binlow -0.8865862 0.4395371 -2.0170907 fixed
RUS score_binlow -0.7835864 0.3926972 -1.9953957 fixed
IND score_binlow -0.9541267 0.5379901 -1.7735024 fixed
MAR score_binlow -0.6941345 0.4249078 -1.6336121 fixed
PAN score_binlow -0.2638990 0.3569011 -0.7394177 fixed

Across all languages, lower-scoring essays are closer to their language's centroid: the score_binlow estimate is negative for every language, though not reliably so for a few (e.g., PAN, MAR, IND).

5.1.0.1 Effect sizes:

bin_dists_low <- distance_measures %>%
  group_by(L1_code, score_bin) %>%
  summarize(sd = sd(dist_from_grand_mean),
            mean = mean(dist_from_grand_mean),
            n = n()) %>%
filter(score_bin == "low") %>%
  rename(low_sd = sd,
         low_mean = mean,
         low_n = n)

bin_dists_high <- distance_measures %>%
  group_by(L1_code, score_bin) %>%
  summarize(sd = sd(dist_from_grand_mean),
            mean = mean(dist_from_grand_mean),
            n = n()) %>%
filter(score_bin == "high") %>%
  rename(high_sd = sd,
         high_mean = mean,
         high_n = n)

all_dists = inner_join(bin_dists_low, bin_dists_high, by = "L1_code")

effect_sizes = all_dists %>%
  rowwise()  %>%
  mutate(d = compute.es::mes( high_mean, low_mean, high_sd, low_sd,  high_n * 10, low_n * 10, verbose = FALSE)$d)
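
compute.es::mes() reports Cohen's d; assuming it uses the standard pooled-SD formula, a direct computation should match up to rounding (a sketch; cohens_d() and d_manual are illustrative names):

# sketch: Cohen's d with a pooled standard deviation, for comparison with mes()
cohens_d <- function(m1, m2, s1, s2, n1, n2) {
  sd_pooled <- sqrt(((n1 - 1) * s1^2 + (n2 - 1) * s2^2) / (n1 + n2 - 2))
  (m1 - m2) / sd_pooled
}

all_dists %>%
  mutate(d_manual = cohens_d(high_mean, low_mean, high_sd, low_sd,
                             high_n * 10, low_n * 10)) %>%
  select(L1_code, d_manual)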

effect_sizes %>%
  select(L1_code, d) %>%
  arrange(-d) %>%
  kable()
L1_code d
CHI 0.29
KAN 0.26
VIE 0.26
BEN 0.25
DUT 0.25
POR 0.25
RUM 0.25
SPA 0.25
GRE 0.24
FAS 0.23
YOR 0.23
IBO 0.22
FRE 0.21
KOR 0.21
ENG 0.19
ARA 0.18
GER 0.18
ITA 0.16
POL 0.16
RUS 0.16
THA 0.16
IND 0.15
URD 0.15
JPN 0.14
TAM 0.14
TEL 0.14
BUL 0.13
HIN 0.12
TUR 0.11
GUJ 0.10
MAL 0.10
MAR 0.10
NEP 0.10
PAN 0.10
TGL 0.09

5.2 Variance

Diameter as the average distance to all other points:

tsne_diameters_LB_mean <- tsne_dims_downsampled %>%
  split(.$L1_score_prompt) %>% 
  map_df(get_group_diameter, "L1_score_prompt", "mean_dist")  %>%
  separate(L1_score_prompt, c("L1_code" ,"score_bin", "prompt_id"), sep = "_") 

tsne_diameters_LB_mean %>%
  group_by(score_bin, L1_code) %>%
  multi_boot_standard(col = "diameter", na.rm = T) %>%
  ggplot(aes(x = score_bin, 
                      y = mean, 
                      fill = score_bin)) +
  geom_bar(stat = "identity") +
  facet_wrap(~L1_code, nrow = 5) +
  geom_linerange(aes(ymin = ci_lower, ymax = ci_upper)) +
  ggtitle("Diameter, averaging across prompts") +
  ylab("mean diameter (average dist to all within-group points)") +
  theme_minimal()

# Paired t-test comparing high vs. low diameter across prompts for one language,
# keeping only prompts that have both score bins.
get_uneven_tscore <- function(current_lang, tsne_diameters_LB_mean){
  m <- tsne_diameters_LB_mean %>% 
    filter(L1_code == current_lang)

  # prompts with both a high and a low bin for this language
  good_ids <- m %>%
    count(prompt_id) %>%
    as.data.frame() %>%
    filter(n == 2)

  k <- m %>%
    filter(prompt_id %in% good_ids$prompt_id) 

  t.value <- t.test(k$diameter ~ k$score_bin, paired = TRUE)$statistic
  
  data.frame(t.value, L1_code = current_lang)
}


map_df(unique(tsne_diameters_LB_mean$L1_code), get_uneven_tscore, tsne_diameters_LB_mean) %>%
  arrange(t.value) %>%
  kable()
t.value L1_code
-4.3482652 CHI
-2.7806569 IBO
-2.4520653 ITA
-2.4110134 FAS
-1.9970025 BEN
-1.9844677 URD
-1.9796668 RUM
-1.9622424 VIE
-1.9094973 FRE
-1.8241657 KAN
-1.7478173 SPA
-1.6417110 YOR
-1.4032851 GRE
-1.3187022 RUS
-1.2189869 PAN
-1.1763872 MAR
-1.1432129 KOR
-0.9390124 POL
-0.8545246 ARA
-0.8229936 GUJ
-0.7858954 HIN
-0.6728167 IND
-0.6575782 TAM
-0.5593251 TUR
-0.5472320 TEL
-0.5067167 NEP
-0.4851766 JPN
-0.2190358 THA
-0.1851714 GER
-0.0495870 MAL
0.0962739 ENG
0.1134695 POR
0.4225591 TGL
0.5786750 DUT
2.5375746 BUL

By language, there is little evidence of a variance difference: most t-values are small, with only a few languages showing a clear effect (e.g., CHI, IBO, ITA, and FAS at the negative end, BUL at the positive end).