Here we look at the difference in the distributions of low and high scoring essays along two measures: mean and variance. We do two versions of this analysis: (A) pooling across all languages, and (B) by language. For each, we downsample so that there is the same number of essays in each language x prompt x score bin.

SUMMARY: Lower scoring essays are closer to the mean (both pooling across languages and by language). There is more variance for lower scoring essays, but this effect is driven by just one or two languages.


Parameters

CUTOFF <- 4                 # essays with score < CUTOFF are binned as "low", >= CUTOFF as "high"
N_PER_L_SCORE_PROMPT <- 50  # essays sampled per language x score x prompt cell (with replacement)
MIN_SCORE <- 2              # essays with score <= MIN_SCORE are excluded

1 Data

Read in doctag indices, docvecs, and metadata

doctag_indices <- read_feather("../../all_data/doctag_indices.feather") 
docvecs <- read_feather("../../all_data/docvecs.feather") %>%
  as.data.frame() %>%
  mutate(offset = 0:(n()-1)) %>%
  select(offset, everything())

#write_tsv(docvecs, "../all_data/docvecs.txt")
metadata <- read_csv("../../all_data/raw/merged_metadata.csv") 
metadata_clean <- metadata %>%
  mutate_if(is.character, as.factor)  %>%
  mutate(essay_id = as.character(essay_id)) 
n_scores_not_na <- metadata_clean %>% 
  filter(!is.na(score)) %>% 
  nrow()

We have scores for 91% of all essays.
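The 91% figure is presumably computed from n_scores_not_na; a minimal sketch of the calculation (the rounding is an assumption):

# share of essays with a non-missing score
round(100 * n_scores_not_na / nrow(metadata_clean))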

Merge all data sources together

d <- doctag_indices %>%
  left_join(metadata_clean) %>%
  left_join(docvecs) %>%
  nest(-1:-10, .key = "doc_vector") %>%
  select(-doc_count, -offset, -test_center_country_code) %>%
  mutate(prompt_id = fct_recode(prompt_id, 
                                broad.academics = "VC079857", 
                                young.enjoy = "VC139555", 
                                young.help = "VC182427",
                                advertisements = "VC199494",
                                fewer.cars = "VC199744",
                                tour.travel = "VC212639",
                                students.facts = "VC247638",
                                success.risks = "VC251653"))

Get t-SNE coordinates (read from a cached file; the commented-out code below shows how they were computed). These are the same for both the all-languages and by-language analyses.

#mats <- d %>%
#  select(doc_vector) %>%
#  unnest() %>%
#  as.matrix()

#tsne_out = Rtsne::Rtsne(mats)
#tsne_dims <- tsne_out$Y %>%
#  as.data.frame() %>%
#  rename(tsne_X = V1,
#         tsne_Y = V2) %>%
#  bind_cols(d %>% select(essay_id, score, age, gender, L1_code, prompt_id)) %>%
#  select(everything(), tsne_X, tsne_Y)
# write_csv(tsne_dims, "../all_data/tsne_dims_cached.csv")
tsne_dims <- read_csv("../../all_data/tsne_dims_cached.csv")

2 Downsample

Scores are unevenly distributed across languages, so we need to downsample. Here we sample N_PER_L_SCORE_PROMPT (50) essays per language x score x prompt cell, with replacement.

ggplot(tsne_dims) +
  geom_histogram(aes(x = score, fill = L1_code), binwidth = .5) +
  facet_wrap(~L1_code) +
  theme_bw() +
  theme(legend.position = "none")

Downsample essays.

tsne_dims_downsampled <- tsne_dims %>%
  nest(1:2, .key = "doc_vector") %>%
  filter(score > MIN_SCORE) %>%
  filter(!is.na(score)) %>%
  group_by(L1_code, score, prompt_id) %>%
  sample_n(N_PER_L_SCORE_PROMPT, replace = T) %>%
  mutate(score_bin = ifelse(score < CUTOFF, "low", "high"),
         score_prompt = paste(score_bin, prompt_id, sep = "_"),
         L1_score_prompt = paste(L1_code, score_bin, prompt_id, sep = "_"),
         L1_prompt = paste(L1_code, prompt_id, sep = "_")) %>%
  ungroup()

The bins:

tsne_dims_downsampled %>%
  group_by(score_bin) %>%
  distinct(score)
## # A tibble: 6 x 2
## # Groups:   score_bin [2]
##   score score_bin
##   <dbl>     <chr>
## 1   2.5       low
## 2   3.0       low
## 3   3.5       low
## 4   4.0      high
## 5   4.5      high
## 6   5.0      high
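After downsampling, every language x score x prompt cell should contain exactly N_PER_L_SCORE_PROMPT essays. A quick sanity check (not part of the original code):

# each language x score x prompt cell should have exactly N_PER_L_SCORE_PROMPT rows
tsne_dims_downsampled %>%
  count(L1_code, score, prompt_id) %>%
  ungroup() %>%
  distinct(n)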

3 All Languages

3.1 Mean

The reference centroid here is the grand center: essay vectors are averaged up hierarchically (language x score bin x prompt centroids -> language x prompt -> prompt -> grand centroid).

get_group_centroid <- function(doc_vecs_df, group_name){
  # mean t-SNE position of all rows in this group
  centroid <- doc_vecs_df %>%
    select(doc_vector) %>%
    unnest(doc_vector) %>%
    colMeans()

  # the group label (identical for every row within a split)
  this_group_name <- doc_vecs_df %>%
    select_(group_name) %>%
    slice(1) %>%
    unlist()

  # one-row data frame: group label first, then centroid coordinates
  df <- data.frame(t(centroid)) %>%
    mutate(x = this_group_name) %>%
    select(x, everything())

  names(df)[1] <- group_name
  df
}

# language x score bin x prompt centroids
tsne_centroids_BL_score_prompt <- tsne_dims_downsampled %>%
  split(.$L1_score_prompt) %>%
  map_df(get_group_centroid, "L1_score_prompt") %>%
  separate(L1_score_prompt, c("L1_code", "score_bin", "prompt_id"), "_") %>%
  nest(4:5, .key = "doc_vector")

grand_centroids <- tsne_centroids_BL_score_prompt %>% # score bin x prompt x language centroids
  mutate(L1_prompt = paste0(L1_code, "_", prompt_id)) %>%
  split(.$L1_prompt) %>%
  map_df(get_group_centroid, "L1_prompt") %>% # average over score bins -> prompt x language
  separate(L1_prompt, c("L1_code", "prompt_id"), "_") %>%
  group_by(prompt_id) %>%
  summarize(tsne_X = mean(tsne_X), # average over languages -> one point per prompt
            tsne_Y = mean(tsne_Y)) %>%
  mutate(group = "all") %>%
  group_by(group) %>%
  summarize(center_tsne_X = mean(tsne_X), # average over prompts -> single grand centroid
            center_tsne_Y = mean(tsne_Y))

group_centroids <- tsne_centroids_BL_score_prompt %>% # score bin x prompt x language centroids
  mutate(prompt_score = paste0(prompt_id, "_", score_bin)) %>%
  split(.$prompt_score) %>%
  map_df(get_group_centroid, "prompt_score") %>% # average over languages -> score bin x prompt
  mutate(group = "all") %>%
  separate(prompt_score, c("prompt_id", "score_bin"), "_")
  
distance_measures <- group_centroids %>%
  left_join(grand_centroids) %>%
  rowwise() %>%
  mutate(dist_from_grand_mean = dist(rbind(c(center_tsne_X,
                                                center_tsne_Y),
                                              c(tsne_X, tsne_Y)), method = "euclidean")[1])

bin_dists <- distance_measures %>%
  group_by(score_bin) %>%
  multi_boot_standard(col = "dist_from_grand_mean")

kable(bin_dists)
score_bin ci_lower ci_upper mean
high 13.24068 23.42909 18.66798
low 11.13162 21.34514 16.64381
ggplot(bin_dists, aes(x = score_bin, 
                      y = mean, 
                      fill = score_bin)) +
  geom_bar(stat = "identity") +
  geom_linerange(aes(ymin = ci_lower, ymax = ci_upper)) +
  ggtitle("Distance from grand centroid (bootstrapping across prompts)") +
  ylab("distance from grand centroid") +
  theme_minimal()

Model

model_data <- tsne_dims_downsampled %>%
  mutate(group = "all") %>%
  left_join(grand_centroids) %>%
  unnest() %>%
  rowwise() %>%
  mutate(dist_from_grand_mean = 
           dist(rbind(c(center_tsne_X, center_tsne_Y),
                      c(tsne_X, tsne_Y)), method = "euclidean")[1])

# these don't converge
#model <- lmer(dist_from_grand_mean ~  score_bin + (score_bin|prompt_id) + (prompt_id|L1_code) +  (score_bin|L1_code), data= model_data,  control=lmerControl(optimizer = "bobyqa"))

#model <- lmer(dist_from_grand_mean ~  score_bin + (score_bin|prompt_id) + (prompt_id|L1_code) , data= model_data,  control=lmerControl(optimizer = "bobyqa"))

#model <- lmer(dist_from_grand_mean ~  score_bin +  (prompt_id|L1_code) , data= model_data,  control=lmerControl(optimizer = "bobyqa"))

model <- lmer(dist_from_grand_mean ~  score_bin +  (1|L1_code) + (1|prompt_id) , data= model_data,  control=lmerControl(optimizer = "bobyqa"))
summary(model)
## Linear mixed model fit by REML ['lmerMod']
## Formula: 
## dist_from_grand_mean ~ score_bin + (1 | L1_code) + (1 | prompt_id)
##    Data: model_data
## Control: lmerControl(optimizer = "bobyqa")
## 
## REML criterion at convergence: 25523
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -5.1360 -0.6089  0.0694  0.7035  4.7005 
## 
## Random effects:
##  Groups    Name        Variance Std.Dev.
##  L1_code   (Intercept)  0.8699  0.9327  
##  prompt_id (Intercept) 54.2244  7.3637  
##  Residual               7.5540  2.7485  
## Number of obs: 5230, groups:  L1_code, 11; prompt_id, 8
## 
## Fixed effects:
##              Estimate Std. Error t value
## (Intercept)  19.16907    2.61916   7.319
## score_binlow -1.89676    0.07603 -24.949
## 
## Correlation of Fixed Effects:
##             (Intr)
## score_binlw -0.014

Lower scoring essays are closer to the overall group mean (the fixed effect for the low score bin is reliably negative).

3.2 Variance

Here are scatter plots for each prompt and score bin; a 2D density version is sketched after the plot code.

ggplot(tsne_dims_downsampled %>% unnest(), aes(color = score_bin, x = tsne_X, y = tsne_Y)) +
  facet_wrap(~prompt_id) + 
  geom_point(size = .2, alpha = .3) +
  theme_minimal()
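The 2D density version is not reproduced above; a minimal sketch with the same aesthetics (geom_density_2d is an assumed choice of geom, not necessarily the original one):

ggplot(tsne_dims_downsampled %>% unnest(), aes(color = score_bin, x = tsne_X, y = tsne_Y)) +
  facet_wrap(~prompt_id) +
  geom_density_2d(size = .3, alpha = .7) +
  theme_minimal()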

Averaging across prompts, there is no difference between high and low scoring bins in their variance when measured as the maximum pairwise distance. Here is diameter measured instead as the average distance to all other within-group points:

tsne_diameters_LA_mean <- tsne_dims_downsampled %>%
  split(.$score_prompt) %>% 
  map_df(get_group_diameter, "score_prompt", "mean_dist")  %>%
  separate(score_prompt, c("score_bin","prompt_id"), sep = "_")
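The get_group_diameter helper used above is defined elsewhere in the project. A hypothetical sketch of what it might look like, assuming it takes a group's rows, the name of the grouping column, and a method ("mean_dist" for the mean pairwise distance used here; presumably "max_dist" for the maximum-distance measure mentioned above), and returns a one-row data frame with the group label and its diameter:

# hypothetical sketch of get_group_diameter (the actual definition lives elsewhere)
get_group_diameter <- function(doc_vecs_df, group_name, method = "mean_dist") {
  # t-SNE coordinates of this group's essays
  coords <- doc_vecs_df %>%
    select(doc_vector) %>%
    unnest(doc_vector) %>%
    as.matrix()

  # all pairwise Euclidean distances within the group
  pairwise <- dist(coords, method = "euclidean")

  diameter <- if (method == "max_dist") max(pairwise) else mean(pairwise)

  # one row per group: group label + diameter
  out <- data.frame(x = doc_vecs_df[[group_name]][1],
                    diameter = diameter,
                    stringsAsFactors = FALSE)
  names(out)[1] <- group_name
  out
}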

tsne_diameters_LA_mean %>%
  group_by(score_bin) %>%
  multi_boot_standard(col = "diameter") %>%
  ggplot(aes(x = score_bin, 
                      y = mean, 
                      fill = score_bin)) +
  geom_bar(stat = "identity") +
  geom_linerange(aes(ymin = ci_lower, ymax = ci_upper)) +
  ggtitle("Diameter, averaging across prompts") +
  ylab("mean diameter (average dist to all within-group points)") +
  theme_minimal() 

With this measure, there is a trend for low scoring essays to have more variance.

low = filter(tsne_diameters_LA_mean, score_bin == "low")
high = filter(tsne_diameters_LA_mean, score_bin == "high")

t.test(low$diameter, high$diameter, paired = TRUE)
## 
##  Paired t-test
## 
## data:  low$diameter and high$diameter
## t = 2.6818, df = 7, p-value = 0.03145
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  0.05742617 0.91354893
## sample estimates:
## mean of the differences 
##               0.4854876

In a paired t-test, the difference between score bins is reliable. But this trend is driven by Arabic, Telugu, and Hindi (see below).

4 By Language

4.1 Mean

By language: here the reference centroid is each language's own centroid (averaged over score bins and prompts within that language).

grand_centroids <- tsne_centroids_BL_score_prompt %>% # score bin x prompt x language centroids
  mutate(L1_prompt = paste0(L1_code, "_", prompt_id)) %>%
  split(.$L1_prompt) %>%
  map_df(get_group_centroid, "L1_prompt") %>% # average over score bins -> prompt x language
  separate(L1_prompt, c("L1_code", "prompt_id"), "_") %>%
  nest(3:4, .key = "doc_vector") %>%
  split(.$L1_code) %>%
  map_df(get_group_centroid, "L1_code") %>% # average over prompts -> one centroid per language
  rename(center_tsne_X = tsne_X,
         center_tsne_Y = tsne_Y)

distance_measures <- tsne_centroids_BL_score_prompt %>%
  unnest() %>%
  left_join(grand_centroids) %>%
  rowwise() %>%
  mutate(dist_from_grand_mean = 
           dist(rbind(c(center_tsne_X, center_tsne_Y),
                      c(tsne_X, tsne_Y)), method = "euclidean")[1])

bin_dists <- distance_measures %>%
  group_by(L1_code, score_bin) %>%
  multi_boot_standard(col = "dist_from_grand_mean")

ggplot(bin_dists, aes(x = L1_code, y = mean, group = score_bin, fill = score_bin)) +
  geom_bar(stat = "identity", position = position_dodge() ) +
  xlab("Native language") +
  geom_linerange(aes(ymin = ci_lower, ymax = ci_upper), position = position_dodge(.9)) +
  ylab("Euclidean distance from language centroid") +
  theme_minimal() +
  theme(text = element_text(size = 15),
          axis.text.x = element_text(angle = 90, hjust = 1)) +
  scale_fill_brewer(palette = "Set1", name = "Score") 

kable(bin_dists)
L1_code score_bin ci_lower ci_upper mean
ARA high 12.737239 23.40105 18.58388
ARA low 10.657336 21.26739 16.41873
CHI high 12.681038 23.56500 18.41718
CHI low 10.201051 21.53665 15.94971
FRE high 12.519551 24.37193 18.55082
FRE low 10.397365 21.28758 16.28284
GER high 12.806644 23.45367 18.16070
GER low 11.091125 21.81126 16.61737
HIN high 15.795136 24.28444 20.31462
HIN low 14.407386 24.02795 19.31501
ITA high 12.563921 22.84933 18.10858
ITA low 10.404202 21.08777 16.13474
JPN high 13.019992 23.36015 18.06883
JPN low 10.014038 20.52140 15.57402
KOR high 13.107354 23.26556 18.64588
KOR low 10.733554 20.95135 16.20982
SPA high 11.717962 23.14103 17.80769
SPA low 9.925172 20.90527 15.84656
TEL high 15.703953 24.83517 20.64367
TEL low 14.397329 22.97448 19.03851
TUR high 13.485145 23.02447 18.62233
TUR low 11.661252 21.54227 16.84659

Model

model_data <- tsne_dims_downsampled %>%
  left_join(grand_centroids) %>%
  unnest() %>%
  rowwise() %>%
  mutate(dist_from_grand_mean = 
           dist(rbind(c(center_tsne_X, center_tsne_Y),
                      c(tsne_X, tsne_Y)), method = "euclidean")[1])
model_data %>%
  group_by(L1_code) %>%
  do(tidy(lmer(dist_from_grand_mean ~  score_bin + (score_bin|prompt_id), data=.,control=lmerControl(optimizer = "bobyqa")))) %>%
  filter(term == "score_binlow") %>%
  arrange(statistic) %>%
  kable()
L1_code term estimate std.error statistic group
CHI score_binlow -2.522905 0.3440170 -7.333662 fixed
KOR score_binlow -2.260177 0.3341653 -6.763650 fixed
SPA score_binlow -1.868965 0.2981305 -6.268951 fixed
FRE score_binlow -2.212007 0.4367975 -5.064146 fixed
TEL score_binlow -1.500903 0.3191746 -4.702451 fixed
TUR score_binlow -1.712177 0.4142930 -4.132767 fixed
JPN score_binlow -2.221086 0.6025168 -3.686347 fixed
ITA score_binlow -1.739164 0.4896557 -3.551809 fixed
ARA score_binlow -2.205054 0.6458240 -3.414327 fixed
GER score_binlow -1.333625 0.4820483 -2.766580 fixed
HIN score_binlow -1.117017 0.4340396 -2.573536 fixed

Across all languages, lower scoring essays are closer to the group mean.

4.1.0.1 Effect sizes

bin_dists_low <- distance_measures %>%
  group_by(L1_code, score_bin) %>%
  summarize(sd = sd(dist_from_grand_mean),
            mean = mean(dist_from_grand_mean),
            n = n()) %>%
filter(score_bin == "low") %>%
  rename(low_sd = sd,
         low_mean = mean,
         low_n = n)

bin_dists_high <- distance_measures %>%
  group_by(L1_code, score_bin) %>%
  summarize(sd = sd(dist_from_grand_mean),
            mean = mean(dist_from_grand_mean),
            n = n()) %>%
filter(score_bin == "high") %>%
  rename(high_sd = sd,
         high_mean = mean,
         high_n = n)

all_dists = inner_join(bin_dists_low, bin_dists_high, by = "L1_code")

effect_sizes = all_dists %>%
  rowwise()  %>%
  mutate(d = compute.es::mes( high_mean, low_mean, high_sd, low_sd,  high_n * 10, low_n * 10, verbose = FALSE)$d)

effect_sizes %>%
  select(L1_code, d) %>%
  arrange(-d) %>%
  kable()
L1_code d
JPN 0.31
KOR 0.31
CHI 0.29
ARA 0.26
FRE 0.26
ITA 0.24
SPA 0.23
TEL 0.23
TUR 0.23
GER 0.18
HIN 0.14

4.2 Variance

Diameter as the average distance to all other points:

tsne_diameters_LB_mean <- tsne_dims_downsampled %>%
  split(.$L1_score_prompt) %>% 
  map_df(get_group_diameter, "L1_score_prompt", "mean_dist")  %>%
  separate(L1_score_prompt, c("L1_code" ,"score_bin", "prompt_id"), sep = "_") 

tsne_diameters_LB_mean %>%
  group_by(score_bin, L1_code) %>%
  multi_boot_standard(col = "diameter", na.rm = T) %>%
  ggplot(aes(x = score_bin, 
                      y = mean, 
                      fill = score_bin)) +
  geom_bar(stat = "identity") +
  facet_wrap(~L1_code) +
  geom_linerange(aes(ymin = ci_lower, ymax = ci_upper)) +
  ggtitle("Diameter, averaging across prompts") +
  ylab("mean diameter (average dist to all within-group points)") +
  theme_minimal() 

tsne_diameters_LB_mean_wide = tsne_diameters_LB_mean %>%
  filter(!is.na(diameter)) %>%
    spread(L1_code, diameter)

lapply(tsne_diameters_LB_mean_wide %>% select(-prompt_id, -score_bin), function(x) 
                   t.test(x ~ tsne_diameters_LB_mean_wide$score_bin, paired = TRUE)$statistic) %>%
  bind_cols() %>%
  gather("L1_code", "t.value") %>%
  arrange(t.value) %>%
  kable()
L1_code t.value
TEL -2.3056800
HIN -1.8555782
ARA -1.1745673
KOR -0.6999361
GER -0.5066852
SPA -0.4778886
TUR -0.3570271
ITA -0.2680945
CHI -0.2591910
JPN -0.2027548
FRE 0.3216314

There's no effect here for most languages; only Telugu and Hindi show any trend toward more variance in the low scoring bin.