Here, we look at the difference in the distributions of low and high scoring esssays along two measures, mean and variance. We do two versions of this analysis: (1) Pooling across all languages, and (2) By language. For each, we first average across prompt.

Note that the exact numbers are dependent on the tSNE dimensions (should probably average across models).

Read in doctag indices, docvecs, and metadata

doctag_indices <- read_feather("../all_data/doctag_indices.feather") 
docvecs <- read_feather("../all_data/docvecs.feather") %>%
  as.data.frame() %>%
  mutate(offset = 0:(n()-1)) %>%
  select(offset, everything())

#write_tsv(docvecs, "../all_data/docvecs.txt")
metadata <- read_csv("../all_data/merged_metadata.csv") 
metadata_clean <- metadata %>%
  mutate_if(is.character, as.factor)  %>%
  mutate(essay_id = as.character(essay_id))

Score summaries

Set low vs. high bin score cutoff.

CUTOFF <- 3.5
  #median(metadata_clean$score, na.rm = T)

ggplot(metadata_clean, aes(x = score)) +
  geom_histogram(binwidth = .5) +
  geom_vline(aes(xintercept = CUTOFF), color = "red") +
  ggtitle("Score distribution") +
  theme_minimal()

n_scores_not_na <- metadata_clean %>% 
  filter(!is.na(score)) %>% 
  nrow()

metadata_clean %>%
  count(L1_code)

## # A tibble: 11 x 2
##    L1_code     n
##     <fctr> <int>
##  1     ARA  1100
##  2     CHI  1100
##  3     FRE  1100
##  4     GER  1100
##  5     HIN  1100
##  6     ITA  1100
##  7     JPN  1100
##  8     KOR  1100
##  9     SPA  1100
## 10     TEL  1100
## 11     TUR  1100

Of all essays, we have scores for 91% of essays.

Merge all data sources together

d <- doctag_indices %>%
  left_join(metadata_clean) %>%
  left_join(docvecs) %>%
  nest(-1:-10, .key = "doc_vector") %>%
  select(-doc_count, -offset, -test_center_country_code) %>%
  mutate(prompt_id = fct_recode(prompt_id, 
                                broad.academics = "VC079857", 
                                young.enjoy = "VC139555", 
                                young.help = "VC182427",
                                advertisements = "VC199494",
                                fewer.cars = "VC199744",
                                tour.travel = "VC212639",
                                students.facts = "VC247638",
                                success.risks = "VC251653"))

Get TSNE coordinates. These are the same for both the all-languages and by-language analyses.

#mats <- d %>%
#  select(doc_vector) %>%
#  unnest() %>%
#  as.matrix()

#tsne_out = Rtsne::Rtsne(mats)
#tsne_dims <- tsne_out$Y %>%
#  as.data.frame() %>%
#  rename(tsne_X = V1,
#         tsne_Y = V2) %>%
#  bind_cols(d %>% select(essay_id, score, age, gender, L1_code, prompt_id)) %>%
#  select(everything(), tsne_X, tsne_Y)
# write_csv(tsne_dims, "../all_data/tsne_dims_cached.csv")
tsne_dims <- read_csv("../all_data/tsne_dims_cached.csv")

1 All Languages

Get score bins

tsne_AL_prompt_bins <- tsne_dims %>%
  nest(1:2, .key = "doc_vector") %>%
  filter(!is.na(score)) %>%
  group_by(prompt_id) %>%
  mutate(score_bin = ifelse(score < CUTOFF, "low", "high"),
         prompt_score = paste(prompt_id, score_bin, sep = "_")) %>%
         #score_bin = ntile(score, 2)
  ungroup()

tsne_AL_prompt_bins %>%
  group_by(prompt_id, score_bin) %>%
  summarize(n = n(),
            mean = mean(score)) %>%
  kable(caption = "Bin x prompt summary")

Bin x prompt summary
prompt_id	score_bin	n	mean
advertisements	high	681	4.049192
advertisements	low	670	2.538060
broad.academics	high	822	4.077859
broad.academics	low	699	2.607296
fewer.cars	high	855	4.089474
fewer.cars	low	681	2.621880
students.facts	high	829	4.038601
students.facts	low	721	2.601942
success.risks	high	765	4.005229
success.risks	low	763	2.617955
tour.travel	high	456	4.080044
tour.travel	low	390	2.578205
young.enjoy	high	809	4.038319
young.enjoy	low	610	2.675410
young.help	high	660	4.072727
young.help	low	589	2.558574

1.1 Mean

get_group_centroid <- function(doc_vecs_df, group_name){
  centroid <- doc_vecs_df %>%
    select(doc_vector) %>%
    unnest(doc_vector) %>%
    colMeans() 

  this_group_name  <- doc_vecs_df %>%
    select_(group_name) %>%
    slice(1) %>%
    unlist()

  df <- data.frame(t(centroid)) %>%
    mutate(x = this_group_name) %>%
    select(x, everything())
  
   names(df)[1] = c(as.name(group_name))
   df
}

# prompt centers
tsne_centroids_AL_prompt <- tsne_AL_prompt_bins %>%
  split(.$prompt_id) %>% 
  map_df(get_group_centroid, "prompt_id") %>%
  rename(tsne_X_center = tsne_X,
         tsne_Y_center = tsne_Y)

# prompt by score_bin centers
tsne_centroids_AL_prompt_bin <- tsne_AL_prompt_bins %>%
  split(.$prompt_score) %>% 
  map_df(get_group_centroid, "prompt_score") %>%
  separate(prompt_score, c("prompt_id", "score_bin"), sep = "_") %>%
  left_join(tsne_centroids_AL_prompt)

Plot with mean of each prompt shown as black “x”, with low and high scoring mean for each prompt shown in green and pink.

ggplot(tsne_centroids_AL_prompt_bin) +
  geom_point(aes(color = score_bin, x = tsne_X, y = tsne_Y), size = 1.5) +
  geom_point(aes(x = tsne_X_center, y = tsne_Y_center), shape = 4, color = "black") +
  ggtitle("Mean by prompt and score bin") +
  #geom_text_repel(aes(label = prompt_id, x = tsne_X_center, y = tsne_Y_center)) +
  theme_minimal()

Low scoring essays tend to be further away from the prompt mean (centroid). Note, however, that there can be many more essays in the “high” bin relative to the “low” bin, depending on how bins are split.

Here is a random effect model predicting distance from prompt mean with score, with prompt_id and L1_code as random effects (by slope and intercept).

tsne_dims_AL_prompt_bin <- tsne_dims %>%
  left_join(tsne_centroids_AL_prompt) %>%
  rowwise() %>%
  mutate(dist_from_prompt_mean = dist(rbind(c(tsne_X_center,
                                                   tsne_Y_center),
                                                 c(tsne_X, tsne_Y)), 
                                           method = "euclidean")[1])

AL_mean_mod = lmer(dist_from_prompt_mean ~  score + (score|prompt_id) + (score|L1_code), tsne_dims_AL_prompt_bin)
summary(AL_mean_mod)

## Linear mixed model fit by REML ['lmerMod']
## Formula: dist_from_prompt_mean ~ score + (score | prompt_id) + (score |  
##     L1_code)
##    Data: tsne_dims_AL_prompt_bin
## 
## REML criterion at convergence: 51805.8
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -1.9921 -0.5468 -0.0648  0.3809 15.1688 
## 
## Random effects:
##  Groups    Name        Variance Std.Dev. Corr 
##  L1_code   (Intercept) 0.51954  0.7208        
##            score       0.06206  0.2491   -0.86
##  prompt_id (Intercept) 0.57750  0.7599        
##            score       0.03156  0.1777   -0.78
##  Residual              6.44028  2.5378        
## Number of obs: 11000, groups:  L1_code, 11; prompt_id, 8
## 
## Fixed effects:
##             Estimate Std. Error t value
## (Intercept)   4.6779     0.3609  12.963
## score        -0.2347     0.1023  -2.295
## 
## Correlation of Fixed Effects:
##       (Intr)
## score -0.815

There is a reliable effect of score: Lower scoring essays are further from the group mean.

1.2 Variance

Here are scatter and 2D density plots for each prompt and score bin.

tsne_AL_prompt_bins <- tsne_dims %>%
  filter(!is.na(score)) %>%
  group_by(prompt_id) %>%
  mutate(score_bin = ifelse(score < CUTOFF, "low", "high")) %>%
         #score_bin = ntile(score, 2)
  ungroup()

ggplot(tsne_AL_prompt_bins, aes(color = score_bin, x = tsne_X, y = tsne_Y)) +
  facet_wrap(~prompt_id) + 
  geom_point(size = .2, alpha = .3) +
  theme_minimal()

ggplot(tsne_AL_prompt_bins, aes(color = score_bin, x = tsne_X, y = tsne_Y)) +
  facet_wrap(~prompt_id) + 
  geom_density2d(alpha = .5) + 
  theme_minimal()

Overall, it looks like there are a more outliers for the low bin.

Here we measure variance as the max distance between two essays in each bin.

tsne_diameters_LA_max <- tsne_dims %>%
  filter(!is.na(score)) %>%
  mutate(score_bin = ifelse(score < CUTOFF, "low", "high"),
         prompt_score = paste(prompt_id, score_bin, sep = "_")) %>%
  nest(1:2, .key = "doc_vector") %>%
  split(.$prompt_score) %>% 
  map_df(get_group_diameter, "prompt_score", "max")  %>%
  separate(prompt_score, c("prompt_id", "score_bin"), sep = "_")

tsne_diameters_LA_max %>%
  group_by(score_bin) %>%
  multi_boot_standard(col = "diameter") %>%
  ggplot(aes(x = score_bin, 
                      y = mean, 
                      fill = score_bin)) +
  geom_bar(stat = "identity") +
  geom_linerange(aes(ymin = ci_lower, ymax = ci_upper)) +
  ggtitle("Diameter, averaging across prompts") +
  ylab("mean diameter (max within-group dist)") +
  theme_minimal()

Ranges are bootstrapped 95% CIs. Averaging across prompts, there is no difference betwen high and low scoring bins in terms of their variance. Is there a better measure of variance here?

What about diameter as the average distance to all other points:

tsne_diameters_LA_mean <- tsne_dims %>%
  filter(!is.na(score)) %>%
  mutate(score_bin = ifelse(score < CUTOFF, "low", "high"),
         prompt_score = paste(prompt_id, score_bin, sep = "_")) %>%
  nest(1:2, .key = "doc_vector") %>%
  split(.$prompt_score) %>% 
  map_df(get_group_diameter, "prompt_score", "mean_dist")  %>%
  separate(prompt_score, c("prompt_id", "score_bin"), sep = "_")

tsne_diameters_LA_mean %>%
  group_by(score_bin) %>%
  multi_boot_standard(col = "diameter") %>%
  ggplot(aes(x = score_bin, 
                      y = mean, 
                      fill = score_bin)) +
  geom_bar(stat = "identity") +
  geom_linerange(aes(ymin = ci_lower, ymax = ci_upper)) +
  ggtitle("Diameter, averaging across prompts") +
  ylab("mean diameter (average dist to all within-group points)") +
  theme_minimal()

With this measure, there is a trend for low scoring essays to have more variance.

low = filter(tsne_diameters_LA_mean, score_bin == "low")
high = filter(tsne_diameters_LA_mean, score_bin == "high")

t.test(low$diameter, high$diameter, paired = TRUE)

## 
##  Paired t-test
## 
## data:  low$diameter and high$diameter
## t = 2.5956, df = 7, p-value = 0.03565
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  0.04046505 0.86901229
## sample estimates:
## mean of the differences 
##               0.4547387

In a paired t-test, the difference between score bins is reliable (depending on how you create the bins).

2 By Language

Get score bins

tsne_BL_prompt_bins <- tsne_dims %>%
  nest(1:2, .key = "doc_vector") %>%
  filter(!is.na(score)) %>%
  group_by(prompt_id, L1_code) %>%
  mutate(score_bin = ifelse(score < CUTOFF, "low", "high"),
         prompt_score_L1 = paste(prompt_id, score_bin, L1_code, sep = "_"),
         prompt_L1 = paste(prompt_id, L1_code, sep = "_")) %>%
         #score_bin = ntile(score, 2)
  ungroup()

#tsne_BL_prompt_bins %>%
#  group_by(L1_code, prompt_id, score_bin) %>%
#  summarize(n = n(),
#            mean = mean(score)) %>%
#  kable(caption = "Bin x prompt summary")

2.1 Mean

# prompt centers
tsne_centroids_BL_prompt <- tsne_BL_prompt_bins %>%
  split(.$prompt_L1) %>% 
  map_df(get_group_centroid, "prompt_L1") %>%
  rename(tsne_X_center = tsne_X,
         tsne_Y_center = tsne_Y) %>%
  separate(prompt_L1, c("prompt_id", "L1_code"), sep = "_") 

# prompt by score_bin centers
tsne_centroids_BL_prompt_bin <- tsne_BL_prompt_bins %>%
  split(.$prompt_score_L1) %>% 
  map_df(get_group_centroid, "prompt_score_L1") %>%
  separate(prompt_score_L1, c("prompt_id", "score_bin", "L1_code"), sep = "_") %>%
  left_join(tsne_centroids_BL_prompt)

Plot with mean of each prompt shown as black “x”, with low and high scoring mean for each prompt shown in green and pink. Each language is a facet

ggplot(tsne_centroids_BL_prompt_bin) +
  geom_point(aes(color = score_bin, x = tsne_X, y = tsne_Y), size = 1) +
  geom_point(aes(x = tsne_X_center, y = tsne_Y_center), shape = 4, color = "black") +
  facet_wrap(~L1_code)+
  ggtitle("Mean by prompt, score bin and language") +
  theme_minimal()

We fit a mixed-effect model predicting distance from prompt-language centroid for each language. Slopes by prompt are included as random effects.

tsne_dims_BL_prompt_bin <- tsne_dims %>%
  left_join(tsne_centroids_BL_prompt) %>%
  rowwise() %>%
  mutate(dist_from_prompt_mean = dist(rbind(c(tsne_X_center,
                                                   tsne_Y_center),
                                                 c(tsne_X, tsne_Y)), 
                                           method = "euclidean")[1])

tsne_dims_BL_prompt_bin %>%
  group_by(L1_code) %>%
  do(tidy(lmer(dist_from_prompt_mean ~  score + (score|prompt_id), data=.))) %>%
  filter(term == "score") %>%
  arrange(statistic) %>%
  kable()

L1_code	term	estimate	std.error	statistic	group
HIN	score	-0.6080019	0.0893076	-6.8079517	fixed
TEL	score	-0.5172388	0.0930001	-5.5617017	fixed
TUR	score	-0.2506226	0.1641483	-1.5268063	fixed
CHI	score	-0.3083662	0.2062118	-1.4953859	fixed
KOR	score	-0.1485492	0.2310076	-0.6430489	fixed
GER	score	-0.0789303	0.1467492	-0.5378584	fixed
FRE	score	-0.0196478	0.2101347	-0.0935008	fixed
ITA	score	-0.0020177	0.1189763	-0.0169585	fixed
JPN	score	0.0109130	0.2643309	0.0412853	fixed
SPA	score	0.0082294	0.0897698	0.0916723	fixed
ARA	score	0.0499715	0.1614033	0.3096064	fixed

The effect of score on mean distance only holds for some languages.

2.2 Variance

Diameter as the average distance to all other points:

tsne_diameters_LB_mean <- tsne_dims %>%
  filter(!is.na(score)) %>%
  mutate(score_bin = ifelse(score < CUTOFF, "low", "high"),
         prompt_score_L1 = paste(prompt_id, score_bin, L1_code, sep = "_")) %>%
  nest(1:2, .key = "doc_vector") %>%
  split(.$prompt_score_L1) %>% 
  map_df(get_group_diameter, "prompt_score_L1", "mean_dist")  %>%
  separate(prompt_score_L1, c("prompt_id", "score_bin", "L1_code"), sep = "_") 


tsne_diameters_LB_mean %>%
  group_by(score_bin, L1_code) %>%
  multi_boot_standard(col = "diameter", na.rm = T) %>%
  ggplot(aes(x = score_bin, 
                      y = mean, 
                      fill = score_bin)) +
  geom_bar(stat = "identity") +
  facet_wrap(~L1_code) +
  geom_linerange(aes(ymin = ci_lower, ymax = ci_upper)) +
  ggtitle("Diameter, averaging across prompts") +
  ylab("mean diameter (average dist to all within-group points)") +
  theme_minimal()

tsne_diameters_LB_mean_wide = tsne_diameters_LB_mean %>%
  filter(!is.na(diameter)) %>%
    spread(L1_code, diameter)

lapply(tsne_diameters_LB_mean_wide %>% select(-prompt_id, -score_bin), function(x) 
                   t.test(x ~ tsne_diameters_LB_mean_wide$score_bin, paired = TRUE)$statistic) %>%
  bind_cols() %>%
  gather("L1_code", "t.value") %>%
  arrange(t.value) %>%
  kable()

With this measure, there is a trend for low scoring essays to have more variance (averaging across prompts).

L2ETS Study 1 analyses - Scores

Molly Lewis

2017-10-04