Here we look at the difference in the distributions of low and high scoring essays along two measures: mean and variance. We do two versions of this analysis: (A) pooling across all languages, and (B) by language. For each, we downsample so that there is the same number of essays in each language x prompt x score bin.

SUMMARY: Lower scoring essays are closer to the mean (both pooling across languages and by language). There is more variance for lower scoring essays, but this effect is driven by just one or two languages.


Parameters

CUTOFF <- 4                 # essays with score < CUTOFF are binned as "low", >= CUTOFF as "high"
N_PER_L_SCORE_PROMPT <- 50  # essays sampled per language x score x prompt cell (with replacement)
MIN_SCORE <- 2              # essays with score <= MIN_SCORE are excluded

1 Data

Read in doctag indices, docvecs, and metadata

doctag_indices <- read_feather("../../all_data/doctag_indices.feather") 
docvecs <- read_feather("../../all_data/docvecs.feather") %>%
  as.data.frame() %>%
  mutate(offset = 0:(n()-1)) %>%
  select(offset, everything())

#write_tsv(docvecs, "../all_data/docvecs.txt")
metadata <- read_csv("../../all_data/raw/merged_metadata.csv") 
metadata_clean <- metadata %>%
  mutate_if(is.character, as.factor)  %>%
  mutate(essay_id = as.character(essay_id)) 
n_scores_not_na <- metadata_clean %>% 
  filter(!is.na(score)) %>% 
  nrow()

We have scores for 91% of all essays.
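The 91% figure is presumably computed from n_scores_not_na; a minimal sketch of the calculation (the rounding is an assumption):

# share of essays with a non-missing score
round(100 * n_scores_not_na / nrow(metadata_clean))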

Merge all data sources together

d <- doctag_indices %>%
  left_join(metadata_clean) %>%
  left_join(docvecs) %>%
  nest(-1:-10, .key = "doc_vector") %>%
  select(-doc_count, -offset, -test_center_country_code) %>%
  mutate(prompt_id = fct_recode(prompt_id, 
                                broad.academics = "VC079857", 
                                young.enjoy = "VC139555", 
                                young.help = "VC182427",
                                advertisements = "VC199494",
                                fewer.cars = "VC199744",
                                tour.travel = "VC212639",
                                students.facts = "VC247638",
                                success.risks = "VC251653"))

Get t-SNE coordinates (read from a cached file; the commented-out code below shows how they were computed). These are the same for both the all-languages and by-language analyses.

#mats <- d %>%
#  select(doc_vector) %>%
#  unnest() %>%
#  as.matrix()

#tsne_out = Rtsne::Rtsne(mats)
#tsne_dims <- tsne_out$Y %>%
#  as.data.frame() %>%
#  rename(tsne_X = V1,
#         tsne_Y = V2) %>%
#  bind_cols(d %>% select(essay_id, score, age, gender, L1_code, prompt_id)) %>%
#  select(everything(), tsne_X, tsne_Y)
# write_csv(tsne_dims, "../all_data/tsne_dims_cached.csv")
tsne_dims <- read_csv("../../all_data/tsne_dims_cached.csv")

2 Downsample

Scores are unevenly distributed across languages, so we need to downsample. Here we sample N_PER_L_SCORE_PROMPT (50) essays per language x score x prompt cell, with replacement.

ggplot(tsne_dims) +
  geom_histogram(aes(x = score, fill = L1_code), binwidth = .5) +
  facet_wrap(~L1_code) +
  theme_bw() +
  theme(legend.position = "none")

Downsample essays.

tsne_dims_downsampled <- tsne_dims %>%
  nest(1:2, .key = "doc_vector") %>%
  filter(score > MIN_SCORE) %>%
  filter(!is.na(score)) %>%
  group_by(L1_code, score, prompt_id) %>%
  sample_n(N_PER_L_SCORE_PROMPT, replace = T) %>%
  mutate(score_bin = ifelse(score < CUTOFF, "low", "high"),
         score_prompt = paste(score_bin, prompt_id, sep = "_"),
         L1_score_prompt = paste(L1_code, score_bin, prompt_id, sep = "_"),
         L1_prompt = paste(L1_code, prompt_id, sep = "_")) %>%
  ungroup()

The bins:

tsne_dims_downsampled %>%
  group_by(score_bin) %>%
  distinct(score)
## # A tibble: 6 x 2
## # Groups:   score_bin [2]
##   score score_bin
##   <dbl>     <chr>
## 1   2.5       low
## 2   3.0       low
## 3   3.5       low
## 4   4.0      high
## 5   4.5      high
## 6   5.0      high
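After downsampling, every language x score x prompt cell should contain exactly N_PER_L_SCORE_PROMPT essays. A quick sanity check (not part of the original code):

# each language x score x prompt cell should have exactly N_PER_L_SCORE_PROMPT rows
tsne_dims_downsampled %>%
  count(L1_code, score, prompt_id) %>%
  ungroup() %>%
  distinct(n)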

3 All Languages

3.1 Mean

The reference centroid here is the grand center: essay vectors are averaged up hierarchically (language x score bin x prompt centroids -> language x prompt -> prompt -> grand centroid).

get_group_centroid <- function(doc_vecs_df, group_name){
  # mean t-SNE position of all rows in this group
  centroid <- doc_vecs_df %>%
    select(doc_vector) %>%
    unnest(doc_vector) %>%
    colMeans()

  # the group label (identical for every row within a split)
  this_group_name <- doc_vecs_df %>%
    select_(group_name) %>%
    slice(1) %>%
    unlist()

  # one-row data frame: group label first, then centroid coordinates
  df <- data.frame(t(centroid)) %>%
    mutate(x = this_group_name) %>%
    select(x, everything())

  names(df)[1] <- group_name
  df
}

# language x score bin x prompt centroids
tsne_centroids_BL_score_prompt <- tsne_dims_downsampled %>%
  split(.$L1_score_prompt) %>%
  map_df(get_group_centroid, "L1_score_prompt") %>%
  separate(L1_score_prompt, c("L1_code", "score_bin", "prompt_id"), "_") %>%
  nest(4:5, .key = "doc_vector")

grand_centroids <- tsne_centroids_BL_score_prompt %>% # score bin x prompt x language centroids
  mutate(L1_prompt = paste0(L1_code, "_", prompt_id)) %>%
  split(.$L1_prompt) %>%
  map_df(get_group_centroid, "L1_prompt") %>% # average over score bins -> prompt x language
  separate(L1_prompt, c("L1_code", "prompt_id"), "_") %>%
  group_by(prompt_id) %>%
  summarize(tsne_X = mean(tsne_X), # average over languages -> one point per prompt
            tsne_Y = mean(tsne_Y)) %>%
  mutate(group = "all") %>%
  group_by(group) %>%
  summarize(center_tsne_X = mean(tsne_X), # average over prompts -> single grand centroid
            center_tsne_Y = mean(tsne_Y))

group_centroids <- tsne_centroids_BL_score_prompt %>% # score bin x prompt x language centroids
  mutate(prompt_score = paste0(prompt_id, "_", score_bin)) %>%
  split(.$prompt_score) %>%
  map_df(get_group_centroid, "prompt_score") %>% # average over languages -> score bin x prompt
  mutate(group = "all") %>%
  separate(prompt_score, c("prompt_id", "score_bin"), "_")
  
distance_measures <- group_centroids %>%
  left_join(grand_centroids) %>%
  rowwise() %>%
  mutate(dist_from_grand_mean = dist(rbind(c(center_tsne_X,
                                                center_tsne_Y),
                                              c(tsne_X, tsne_Y)), method = "euclidean")[1])

bin_dists <- distance_measures %>%
  group_by(score_bin) %>%
  multi_boot_standard(col = "dist_from_grand_mean")

kable(bin_dists)
score_bin ci_lower ci_upper mean
high 13.24068 23.42909 18.66798
low 11.13162 21.34514 16.64381
ggplot(bin_dists, aes(x = score_bin, 
                      y = mean, 
                      fill = score_bin)) +
  geom_bar(stat = "identity") +
  geom_linerange(aes(ymin = ci_lower, ymax = ci_upper)) +
  ggtitle("Distance from grand centroid (bootstrapping across prompts)") +
  ylab("distance from grand centroid") +
  theme_minimal()

Model

model_data <- tsne_dims_downsampled %>%
  mutate(group = "all") %>%
  left_join(grand_centroids) %>%
  unnest() %>%
  rowwise() %>%
  mutate(dist_from_grand_mean = 
           dist(rbind(c(center_tsne_X, center_tsne_Y),
                      c(tsne_X, tsne_Y)), method = "euclidean")[1])

# these don't converge
#model <- lmer(dist_from_grand_mean ~  score_bin + (score_bin|prompt_id) + (prompt_id|L1_code) +  (score_bin|L1_code), data= model_data,  control=lmerControl(optimizer = "bobyqa"))

#model <- lmer(dist_from_grand_mean ~  score_bin + (score_bin|prompt_id) + (prompt_id|L1_code) , data= model_data,  control=lmerControl(optimizer = "bobyqa"))

#model <- lmer(dist_from_grand_mean ~  score_bin +  (prompt_id|L1_code) , data= model_data,  control=lmerControl(optimizer = "bobyqa"))

model <- lmer(dist_from_grand_mean ~  score_bin +  (1|L1_code) + (1|prompt_id) , data= model_data,  control=lmerControl(optimizer = "bobyqa"))
summary(model)
## Linear mixed model fit by REML ['lmerMod']
## Formula: 
## dist_from_grand_mean ~ score_bin + (1 | L1_code) + (1 | prompt_id)
##    Data: model_data
## Control: lmerControl(optimizer = "bobyqa")
## 
## REML criterion at convergence: 25523
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -5.1360 -0.6089  0.0694  0.7035  4.7005 
## 
## Random effects:
##  Groups    Name        Variance Std.Dev.
##  L1_code   (Intercept)  0.8699  0.9327  
##  prompt_id (Intercept) 54.2244  7.3637  
##  Residual               7.5540  2.7485  
## Number of obs: 5230, groups:  L1_code, 11; prompt_id, 8
## 
## Fixed effects:
##              Estimate Std. Error t value
## (Intercept)  19.16907    2.61916   7.319
## score_binlow -1.89676    0.07603 -24.949
## 
## Correlation of Fixed Effects:
##             (Intr)
## score_binlw -0.014

Lower scoring essays are closer to the overall group mean (the fixed effect for the low score bin is reliably negative).

3.2 Variance

Here are scatter plots for each prompt and score bin; a 2D density version is sketched after the plot code.

ggplot(tsne_dims_downsampled %>% unnest(), aes(color = score_bin, x = tsne_X, y = tsne_Y)) +
  facet_wrap(~prompt_id) + 
  geom_point(size = .2, alpha = .3) +
  theme_minimal()
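The 2D density version is not reproduced above; a minimal sketch with the same aesthetics (geom_density_2d is an assumed choice of geom, not necessarily the original one):

ggplot(tsne_dims_downsampled %>% unnest(), aes(color = score_bin, x = tsne_X, y = tsne_Y)) +
  facet_wrap(~prompt_id) +
  geom_density_2d(size = .3, alpha = .7) +
  theme_minimal()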

Averaging across prompts, there is no difference between high and low scoring bins in their variance when measured as the maximum pairwise distance. Here is diameter measured instead as the average distance to all other within-group points:

tsne_diameters_LA_mean <- tsne_dims_downsampled %>%
  split(.$score_prompt) %>% 
  map_df(get_group_diameter, "score_prompt", "mean_dist")  %>%
  separate(score_prompt, c("score_bin","prompt_id"), sep = "_")
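The get_group_diameter helper used above is defined elsewhere in the project. A hypothetical sketch of what it might look like, assuming it takes a group's rows, the name of the grouping column, and a method ("mean_dist" for the mean pairwise distance used here; presumably "max_dist" for the maximum-distance measure mentioned above), and returns a one-row data frame with the group label and its diameter:

# hypothetical sketch of get_group_diameter (the actual definition lives elsewhere)
get_group_diameter <- function(doc_vecs_df, group_name, method = "mean_dist") {
  # t-SNE coordinates of this group's essays
  coords <- doc_vecs_df %>%
    select(doc_vector) %>%
    unnest(doc_vector) %>%
    as.matrix()

  # all pairwise Euclidean distances within the group
  pairwise <- dist(coords, method = "euclidean")

  diameter <- if (method == "max_dist") max(pairwise) else mean(pairwise)

  # one row per group: group label + diameter
  out <- data.frame(x = doc_vecs_df[[group_name]][1],
                    diameter = diameter,
                    stringsAsFactors = FALSE)
  names(out)[1] <- group_name
  out
}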

tsne_diameters_LA_mean %>%
  group_by(score_bin) %>%
  multi_boot_standard(col = "diameter") %>%
  ggplot(aes(x = score_bin, 
                      y = mean, 
                      fill = score_bin)) +
  geom_bar(stat = "identity") +
  geom_linerange(aes(ymin = ci_lower, ymax = ci_upper)) +
  ggtitle("Diameter, averaging across prompts") +
  ylab("mean diameter (average dist to all within-group points)") +
  theme_minimal() 

With this measure, there is a trend for low scoring essays to have more variance.

low = filter(tsne_diameters_LA_mean, score_bin == "low")
high = filter(tsne_diameters_LA_mean, score_bin == "high")

t.test(low$diameter, high$diameter, paired = TRUE)
## 
##  Paired t-test
## 
## data:  low$diameter and high$diameter
## t = 2.6818, df = 7, p-value = 0.03145
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  0.05742617 0.91354893
## sample estimates:
## mean of the differences 
##               0.4854876

In a paired t-test, the difference between score bins is reliable. But this trend is driven by Arabic, Telugu, and Hindi (see below).

4 By Language

4.1 Mean

By language: here the reference centroid is each language's own centroid (averaged over score bins and prompts within that language).

grand_centroids <- tsne_centroids_BL_score_prompt %>% # score bin x prompt x language centroids
  mutate(L1_prompt = paste0(L1_code, "_", prompt_id)) %>%
  split(.$L1_prompt) %>%
  map_df(get_group_centroid, "L1_prompt") %>% # average over score bins -> prompt x language
  separate(L1_prompt, c("L1_code", "prompt_id"), "_") %>%
  nest(3:4, .key = "doc_vector") %>%
  split(.$L1_code) %>%
  map_df(get_group_centroid, "L1_code") %>% # average over prompts -> one centroid per language
  rename(center_tsne_X = tsne_X,
         center_tsne_Y = tsne_Y)

distance_measures <- tsne_centroids_BL_score_prompt %>%
  unnest() %>%
  left_join(grand_centroids) %>%
  rowwise() %>%
  mutate(dist_from_grand_mean = 
           dist(rbind(c(center_tsne_X, center_tsne_Y),
                      c(tsne_X, tsne_Y)), method = "euclidean")[1])

bin_dists <- distance_measures %>%
  group_by(L1_code, score_bin) %>%
  multi_boot_standard(col = "dist_from_grand_mean")

ggplot(bin_dists, aes(x = L1_code, y = mean, group = score_bin, fill = score_bin)) +
  geom_bar(stat = "identity", position = position_dodge() ) +
  xlab("Native language") +
  geom_linerange(aes(ymin = ci_lower, ymax = ci_upper), position = position_dodge(.9)) +
  ylab("Euclidean distance from language centroid") +
  theme_minimal() +
  theme(text = element_text(size = 15),
          axis.text.x = element_text(angle = 90, hjust = 1)) +
  scale_fill_brewer(palette = "Set1", name = "Score") 

kable(bin_dists)
L1_code score_bin ci_lower ci_upper mean
ARA high 12.737239 23.40105 18.58388
ARA low 10.657336 21.26739 16.41873
CHI high 12.681038 23.56500 18.41718
CHI low 10.201051 21.53665 15.94971
FRE high 12.519551 24.37193 18.55082
FRE low 10.397365 21.28758 16.28284
GER high 12.806644 23.45367 18.16070
GER low 11.091125 21.81126 16.61737
HIN high 15.795136 24.28444 20.31462
HIN low 14.407386 24.02795 19.31501
ITA high 12.563921 22.84933 18.10858
ITA low 10.404202 21.08777 16.13474
JPN high 13.019992 23.36015 18.06883
JPN low 10.014038 20.52140 15.57402
KOR high 13.107354 23.26556 18.64588
KOR low 10.733554 20.95135 16.20982
SPA high 11.717962 23.14103 17.80769
SPA low 9.925172 20.90527 15.84656
TEL high 15.703953 24.83517 20.64367
TEL low 14.397329 22.97448 19.03851
TUR high 13.485145 23.02447 18.62233
TUR low 11.661252 21.54227 16.84659

Model

model_data <- tsne_dims_downsampled %>%
  left_join(grand_centroids) %>%
  unnest() %>%
  rowwise() %>%
  mutate(dist_from_grand_mean = 
           dist(rbind(c(center_tsne_X, center_tsne_Y),
                      c(tsne_X, tsne_Y)), method = "euclidean")[1])
model_data %>%
  group_by(L1_code) %>%
  do(tidy(lmer(dist_from_grand_mean ~  score_bin + (score_bin|prompt_id), data=.,control=lmerControl(optimizer = "bobyqa")))) %>%
  filter(term == "score_binlow") %>%
  arrange(statistic) %>%
  kable()
L1_code term estimate std.error statistic group
CHI score_binlow -2.522905 0.3440170 -7.333662 fixed
KOR score_binlow -2.260177 0.3341653 -6.763650 fixed
SPA score_binlow -1.868965 0.2981305 -6.268951 fixed
FRE score_binlow -2.212007 0.4367975 -5.064146 fixed
TEL score_binlow -1.500903 0.3191746 -4.702451 fixed
TUR score_binlow -1.712177 0.4142930 -4.132767 fixed
JPN score_binlow -2.221086 0.6025168 -3.686347 fixed
ITA score_binlow -1.739164 0.4896557 -3.551809 fixed
ARA score_binlow -2.205054 0.6458240 -3.414327 fixed
GER score_binlow -1.333625 0.4820483 -2.766580 fixed
HIN score_binlow -1.117017 0.4340396 -2.573536 fixed

Across all languages, lower scoring essays are closer to the group mean.

4.1.0.1 Effect sizes

bin_dists_low <- distance_measures %>%
  group_by(L1_code, score_bin) %>%
  summarize(sd = sd(dist_from_grand_mean),
            mean = mean(dist_from_grand_mean),
            n = n()) %>%
filter(score_bin == "low") %>%
  rename(low_sd = sd,
         low_mean = mean,
         low_n = n)

bin_dists_high <- distance_measures %>%
  group_by(L1_code, score_bin) %>%
  summarize(sd = sd(dist_from_grand_mean),
            mean = mean(dist_from_grand_mean),
            n = n()) %>%
filter(score_bin == "high") %>%
  rename(high_sd = sd,
         high_mean = mean,
         high_n = n)

all_dists = inner_join(bin_dists_low, bin_dists_high, by = "L1_code")

effect_sizes = all_dists %>%
  rowwise()  %>%
  mutate(d = compute.es::mes( high_mean, low_mean, high_sd, low_sd,  high_n * 10, low_n * 10, verbose = FALSE)$d)

effect_sizes %>%
  select(L1_code, d) %>%
  arrange(-d) %>%
  kable()
L1_code d
JPN 0.31
KOR 0.31
CHI 0.29
ARA 0.26
FRE 0.26
ITA 0.24
SPA 0.23
TEL 0.23
TUR 0.23
GER 0.18
HIN 0.14

4.2 Variance

Diameter as the average distance to all other points:

tsne_diameters_LB_mean <- tsne_dims_downsampled %>%
  split(.$L1_score_prompt) %>% 
  map_df(get_group_diameter, "L1_score_prompt", "mean_dist")  %>%
  separate(L1_score_prompt, c("L1_code" ,"score_bin", "prompt_id"), sep = "_") 

tsne_diameters_LB_mean %>%
  group_by(score_bin, L1_code) %>%
  multi_boot_standard(col = "diameter", na.rm = T) %>%
  ggplot(aes(x = score_bin, 
                      y = mean, 
                      fill = score_bin)) +
  geom_bar(stat = "identity") +
  facet_wrap(~L1_code) +
  geom_linerange(aes(ymin = ci_lower, ymax = ci_upper)) +
  ggtitle("Diameter, averaging across prompts") +
  ylab("mean diameter (average dist to all within-group points)") +
  theme_minimal() 

tsne_diameters_LB_mean_wide = tsne_diameters_LB_mean %>%
  filter(!is.na(diameter)) %>%
    spread(L1_code, diameter)

lapply(tsne_diameters_LB_mean_wide %>% select(-prompt_id, -score_bin), function(x) 
                   t.test(x ~ tsne_diameters_LB_mean_wide$score_bin, paired = TRUE)$statistic) %>%
  bind_cols() %>%
  gather("L1_code", "t.value") %>%
  arrange(t.value) %>%
  kable()
L1_code t.value
TEL -2.3056800
HIN -1.8555782
ARA -1.1745673
KOR -0.6999361
GER -0.5066852
SPA -0.4778886
TUR -0.3570271
ITA -0.2680945
CHI -0.2591910
JPN -0.2027548
FRE 0.3216314

There's no effect here for most languages; only Telugu and Hindi show any trend toward more variance in the low scoring bin.