Here, we look at the difference in the distributions of low and high scoring esssays along two measures, mean and variance. We do two versions of this analysis: (1) Pooling across all languages, and (2) By language. For each, we first average across prompt.

Note that the exact numbers are dependent on the tSNE dimensions (should probably average across models).


Read in doctag indices, docvecs, and metadata

doctag_indices <- read_feather("../all_data/doctag_indices.feather") 
docvecs <- read_feather("../all_data/docvecs.feather") %>%
  as.data.frame() %>%
  mutate(offset = 0:(n()-1)) %>%
  select(offset, everything())

#write_tsv(docvecs, "../all_data/docvecs.txt")
metadata <- read_csv("../all_data/merged_metadata.csv") 
metadata_clean <- metadata %>%
  mutate_if(is.character, as.factor)  %>%
  mutate(essay_id = as.character(essay_id)) 

Score summaries

Set low vs. high bin score cutoff.

CUTOFF <- 3.5
  #median(metadata_clean$score, na.rm = T)
ggplot(metadata_clean, aes(x = score)) +
  geom_histogram(binwidth = .5) +
  geom_vline(aes(xintercept = CUTOFF), color = "red") +
  ggtitle("Score distribution") +
  theme_minimal()

n_scores_not_na <- metadata_clean %>% 
  filter(!is.na(score)) %>% 
  nrow()

metadata_clean %>%
  count(L1_code)
## # A tibble: 11 x 2
##    L1_code     n
##     <fctr> <int>
##  1     ARA  1100
##  2     CHI  1100
##  3     FRE  1100
##  4     GER  1100
##  5     HIN  1100
##  6     ITA  1100
##  7     JPN  1100
##  8     KOR  1100
##  9     SPA  1100
## 10     TEL  1100
## 11     TUR  1100

Of all essays, we have scores for 91% of essays.

Merge all data sources together

d <- doctag_indices %>%
  left_join(metadata_clean) %>%
  left_join(docvecs) %>%
  nest(-1:-10, .key = "doc_vector") %>%
  select(-doc_count, -offset, -test_center_country_code) %>%
  mutate(prompt_id = fct_recode(prompt_id, 
                                broad.academics = "VC079857", 
                                young.enjoy = "VC139555", 
                                young.help = "VC182427",
                                advertisements = "VC199494",
                                fewer.cars = "VC199744",
                                tour.travel = "VC212639",
                                students.facts = "VC247638",
                                success.risks = "VC251653"))

Get TSNE coordinates. These are the same for both the all-languages and by-language analyses.

#mats <- d %>%
#  select(doc_vector) %>%
#  unnest() %>%
#  as.matrix()

#tsne_out = Rtsne::Rtsne(mats)
#tsne_dims <- tsne_out$Y %>%
#  as.data.frame() %>%
#  rename(tsne_X = V1,
#         tsne_Y = V2) %>%
#  bind_cols(d %>% select(essay_id, score, age, gender, L1_code, prompt_id)) %>%
#  select(everything(), tsne_X, tsne_Y)
# write_csv(tsne_dims, "../all_data/tsne_dims_cached.csv")
tsne_dims <- read_csv("../all_data/tsne_dims_cached.csv")

1 All Languages

Get score bins

tsne_AL_prompt_bins <- tsne_dims %>%
  nest(1:2, .key = "doc_vector") %>%
  filter(!is.na(score)) %>%
  group_by(prompt_id) %>%
  mutate(score_bin = ifelse(score < CUTOFF, "low", "high"),
         prompt_score = paste(prompt_id, score_bin, sep = "_")) %>%
         #score_bin = ntile(score, 2)
  ungroup()

tsne_AL_prompt_bins %>%
  group_by(prompt_id, score_bin) %>%
  summarize(n = n(),
            mean = mean(score)) %>%
  kable(caption = "Bin x prompt summary")
Bin x prompt summary
prompt_id score_bin n mean
advertisements high 681 4.049192
advertisements low 670 2.538060
broad.academics high 822 4.077859
broad.academics low 699 2.607296
fewer.cars high 855 4.089474
fewer.cars low 681 2.621880
students.facts high 829 4.038601
students.facts low 721 2.601942
success.risks high 765 4.005229
success.risks low 763 2.617955
tour.travel high 456 4.080044
tour.travel low 390 2.578205
young.enjoy high 809 4.038319
young.enjoy low 610 2.675410
young.help high 660 4.072727
young.help low 589 2.558574

1.1 Mean

get_group_centroid <- function(doc_vecs_df, group_name){
  centroid <- doc_vecs_df %>%
    select(doc_vector) %>%
    unnest(doc_vector) %>%
    colMeans() 

  this_group_name  <- doc_vecs_df %>%
    select_(group_name) %>%
    slice(1) %>%
    unlist()

  df <- data.frame(t(centroid)) %>%
    mutate(x = this_group_name) %>%
    select(x, everything())
  
   names(df)[1] = c(as.name(group_name))
   df
}

# prompt centers
tsne_centroids_AL_prompt <- tsne_AL_prompt_bins %>%
  split(.$prompt_id) %>% 
  map_df(get_group_centroid, "prompt_id") %>%
  rename(tsne_X_center = tsne_X,
         tsne_Y_center = tsne_Y)

# prompt by score_bin centers
tsne_centroids_AL_prompt_bin <- tsne_AL_prompt_bins %>%
  split(.$prompt_score) %>% 
  map_df(get_group_centroid, "prompt_score") %>%
  separate(prompt_score, c("prompt_id", "score_bin"), sep = "_") %>%
  left_join(tsne_centroids_AL_prompt)

Plot with mean of each prompt shown as black “x”, with low and high scoring mean for each prompt shown in green and pink.

ggplot(tsne_centroids_AL_prompt_bin) +
  geom_point(aes(color = score_bin, x = tsne_X, y = tsne_Y), size = 1.5) +
  geom_point(aes(x = tsne_X_center, y = tsne_Y_center), shape = 4, color = "black") +
  ggtitle("Mean by prompt and score bin") +
  #geom_text_repel(aes(label = prompt_id, x = tsne_X_center, y = tsne_Y_center)) +
  theme_minimal()

Low scoring essays tend to be further away from the prompt mean (centroid). Note, however, that there can be many more essays in the “high” bin relative to the “low” bin, depending on how bins are split.

Here is a random effect model predicting distance from prompt mean with score, with prompt_id and L1_code as random effects (by slope and intercept).

tsne_dims_AL_prompt_bin <- tsne_dims %>%
  left_join(tsne_centroids_AL_prompt) %>%
  rowwise() %>%
  mutate(dist_from_prompt_mean = dist(rbind(c(tsne_X_center,
                                                   tsne_Y_center),
                                                 c(tsne_X, tsne_Y)), 
                                           method = "euclidean")[1])

AL_mean_mod = lmer(dist_from_prompt_mean ~  score + (score|prompt_id) + (score|L1_code), tsne_dims_AL_prompt_bin)
summary(AL_mean_mod)
## Linear mixed model fit by REML ['lmerMod']
## Formula: dist_from_prompt_mean ~ score + (score | prompt_id) + (score |  
##     L1_code)
##    Data: tsne_dims_AL_prompt_bin
## 
## REML criterion at convergence: 51805.8
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -1.9921 -0.5468 -0.0648  0.3809 15.1688 
## 
## Random effects:
##  Groups    Name        Variance Std.Dev. Corr 
##  L1_code   (Intercept) 0.51954  0.7208        
##            score       0.06206  0.2491   -0.86
##  prompt_id (Intercept) 0.57750  0.7599        
##            score       0.03156  0.1777   -0.78
##  Residual              6.44028  2.5378        
## Number of obs: 11000, groups:  L1_code, 11; prompt_id, 8
## 
## Fixed effects:
##             Estimate Std. Error t value
## (Intercept)   4.6779     0.3609  12.963
## score        -0.2347     0.1023  -2.295
## 
## Correlation of Fixed Effects:
##       (Intr)
## score -0.815

There is a reliable effect of score: Lower scoring essays are further from the group mean.

1.2 Variance

Here are scatter and 2D density plots for each prompt and score bin.

tsne_AL_prompt_bins <- tsne_dims %>%
  filter(!is.na(score)) %>%
  group_by(prompt_id) %>%
  mutate(score_bin = ifelse(score < CUTOFF, "low", "high")) %>%
         #score_bin = ntile(score, 2)
  ungroup()

ggplot(tsne_AL_prompt_bins, aes(color = score_bin, x = tsne_X, y = tsne_Y)) +
  facet_wrap(~prompt_id) + 
  geom_point(size = .2, alpha = .3) +
  theme_minimal()

ggplot(tsne_AL_prompt_bins, aes(color = score_bin, x = tsne_X, y = tsne_Y)) +
  facet_wrap(~prompt_id) + 
  geom_density2d(alpha = .5) + 
  theme_minimal()

Overall, it looks like there are a more outliers for the low bin.

Here we measure variance as the max distance between two essays in each bin.

tsne_diameters_LA_max <- tsne_dims %>%
  filter(!is.na(score)) %>%
  mutate(score_bin = ifelse(score < CUTOFF, "low", "high"),
         prompt_score = paste(prompt_id, score_bin, sep = "_")) %>%
  nest(1:2, .key = "doc_vector") %>%
  split(.$prompt_score) %>% 
  map_df(get_group_diameter, "prompt_score", "max")  %>%
  separate(prompt_score, c("prompt_id", "score_bin"), sep = "_")

tsne_diameters_LA_max %>%
  group_by(score_bin) %>%
  multi_boot_standard(col = "diameter") %>%
  ggplot(aes(x = score_bin, 
                      y = mean, 
                      fill = score_bin)) +
  geom_bar(stat = "identity") +
  geom_linerange(aes(ymin = ci_lower, ymax = ci_upper)) +
  ggtitle("Diameter, averaging across prompts") +
  ylab("mean diameter (max within-group dist)") +
  theme_minimal() 

Ranges are bootstrapped 95% CIs. Averaging across prompts, there is no difference betwen high and low scoring bins in terms of their variance. Is there a better measure of variance here?

What about diameter as the average distance to all other points:

tsne_diameters_LA_mean <- tsne_dims %>%
  filter(!is.na(score)) %>%
  mutate(score_bin = ifelse(score < CUTOFF, "low", "high"),
         prompt_score = paste(prompt_id, score_bin, sep = "_")) %>%
  nest(1:2, .key = "doc_vector") %>%
  split(.$prompt_score) %>% 
  map_df(get_group_diameter, "prompt_score", "mean_dist")  %>%
  separate(prompt_score, c("prompt_id", "score_bin"), sep = "_")

tsne_diameters_LA_mean %>%
  group_by(score_bin) %>%
  multi_boot_standard(col = "diameter") %>%
  ggplot(aes(x = score_bin, 
                      y = mean, 
                      fill = score_bin)) +
  geom_bar(stat = "identity") +
  geom_linerange(aes(ymin = ci_lower, ymax = ci_upper)) +
  ggtitle("Diameter, averaging across prompts") +
  ylab("mean diameter (average dist to all within-group points)") +
  theme_minimal() 

With this measure, there is a trend for low scoring essays to have more variance.

low = filter(tsne_diameters_LA_mean, score_bin == "low")
high = filter(tsne_diameters_LA_mean, score_bin == "high")

t.test(low$diameter, high$diameter, paired = TRUE)
## 
##  Paired t-test
## 
## data:  low$diameter and high$diameter
## t = 2.5956, df = 7, p-value = 0.03565
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  0.04046505 0.86901229
## sample estimates:
## mean of the differences 
##               0.4547387

In a paired t-test, the difference between score bins is reliable (depending on how you create the bins).

2 By Language

Get score bins

tsne_BL_prompt_bins <- tsne_dims %>%
  nest(1:2, .key = "doc_vector") %>%
  filter(!is.na(score)) %>%
  group_by(prompt_id, L1_code) %>%
  mutate(score_bin = ifelse(score < CUTOFF, "low", "high"),
         prompt_score_L1 = paste(prompt_id, score_bin, L1_code, sep = "_"),
         prompt_L1 = paste(prompt_id, L1_code, sep = "_")) %>%
         #score_bin = ntile(score, 2)
  ungroup()

#tsne_BL_prompt_bins %>%
#  group_by(L1_code, prompt_id, score_bin) %>%
#  summarize(n = n(),
#            mean = mean(score)) %>%
#  kable(caption = "Bin x prompt summary")

2.1 Mean

# prompt centers
tsne_centroids_BL_prompt <- tsne_BL_prompt_bins %>%
  split(.$prompt_L1) %>% 
  map_df(get_group_centroid, "prompt_L1") %>%
  rename(tsne_X_center = tsne_X,
         tsne_Y_center = tsne_Y) %>%
  separate(prompt_L1, c("prompt_id", "L1_code"), sep = "_") 

# prompt by score_bin centers
tsne_centroids_BL_prompt_bin <- tsne_BL_prompt_bins %>%
  split(.$prompt_score_L1) %>% 
  map_df(get_group_centroid, "prompt_score_L1") %>%
  separate(prompt_score_L1, c("prompt_id", "score_bin", "L1_code"), sep = "_") %>%
  left_join(tsne_centroids_BL_prompt)

Plot with mean of each prompt shown as black “x”, with low and high scoring mean for each prompt shown in green and pink. Each language is a facet

ggplot(tsne_centroids_BL_prompt_bin) +
  geom_point(aes(color = score_bin, x = tsne_X, y = tsne_Y), size = 1) +
  geom_point(aes(x = tsne_X_center, y = tsne_Y_center), shape = 4, color = "black") +
  facet_wrap(~L1_code)+
  ggtitle("Mean by prompt, score bin and language") +
  theme_minimal()

We fit a mixed-effect model predicting distance from prompt-language centroid for each language. Slopes by prompt are included as random effects.

tsne_dims_BL_prompt_bin <- tsne_dims %>%
  left_join(tsne_centroids_BL_prompt) %>%
  rowwise() %>%
  mutate(dist_from_prompt_mean = dist(rbind(c(tsne_X_center,
                                                   tsne_Y_center),
                                                 c(tsne_X, tsne_Y)), 
                                           method = "euclidean")[1])

tsne_dims_BL_prompt_bin %>%
  group_by(L1_code) %>%
  do(tidy(lmer(dist_from_prompt_mean ~  score + (score|prompt_id), data=.))) %>%
  filter(term == "score") %>%
  arrange(statistic) %>%
  kable()
L1_code term estimate std.error statistic group
HIN score -0.6080019 0.0893076 -6.8079517 fixed
TEL score -0.5172388 0.0930001 -5.5617017 fixed
TUR score -0.2506226 0.1641483 -1.5268063 fixed
CHI score -0.3083662 0.2062118 -1.4953859 fixed
KOR score -0.1485492 0.2310076 -0.6430489 fixed
GER score -0.0789303 0.1467492 -0.5378584 fixed
FRE score -0.0196478 0.2101347 -0.0935008 fixed
ITA score -0.0020177 0.1189763 -0.0169585 fixed
JPN score 0.0109130 0.2643309 0.0412853 fixed
SPA score 0.0082294 0.0897698 0.0916723 fixed
ARA score 0.0499715 0.1614033 0.3096064 fixed

The effect of score on mean distance only holds for some languages.

2.2 Variance

Diameter as the average distance to all other points:

tsne_diameters_LB_mean <- tsne_dims %>%
  filter(!is.na(score)) %>%
  mutate(score_bin = ifelse(score < CUTOFF, "low", "high"),
         prompt_score_L1 = paste(prompt_id, score_bin, L1_code, sep = "_")) %>%
  nest(1:2, .key = "doc_vector") %>%
  split(.$prompt_score_L1) %>% 
  map_df(get_group_diameter, "prompt_score_L1", "mean_dist")  %>%
  separate(prompt_score_L1, c("prompt_id", "score_bin", "L1_code"), sep = "_") 


tsne_diameters_LB_mean %>%
  group_by(score_bin, L1_code) %>%
  multi_boot_standard(col = "diameter", na.rm = T) %>%
  ggplot(aes(x = score_bin, 
                      y = mean, 
                      fill = score_bin)) +
  geom_bar(stat = "identity") +
  facet_wrap(~L1_code) +
  geom_linerange(aes(ymin = ci_lower, ymax = ci_upper)) +
  ggtitle("Diameter, averaging across prompts") +
  ylab("mean diameter (average dist to all within-group points)") +
  theme_minimal() 

tsne_diameters_LB_mean_wide = tsne_diameters_LB_mean %>%
  filter(!is.na(diameter)) %>%
    spread(L1_code, diameter)

lapply(tsne_diameters_LB_mean_wide %>% select(-prompt_id, -score_bin), function(x) 
                   t.test(x ~ tsne_diameters_LB_mean_wide$score_bin, paired = TRUE)$statistic) %>%
  bind_cols() %>%
  gather("L1_code", "t.value") %>%
  arrange(t.value) %>%
  kable()

With this measure, there is a trend for low scoring essays to have more variance (averaging across prompts).