Here, we look at the difference in the distributions of low and high scoring esssays along two measures, mean and variance. We do two versions of this analysis: (1) Pooling across all languages, and (2) By language. For each, we first average across prompt.
Note that the exact numbers are dependent on the tSNE dimensions (should probably average across models).
Read in doctag indices, docvecs, and metadata
doctag_indices <- read_feather("../all_data/doctag_indices.feather")
docvecs <- read_feather("../all_data/docvecs.feather") %>%
as.data.frame() %>%
mutate(offset = 0:(n()-1)) %>%
select(offset, everything())
#write_tsv(docvecs, "../all_data/docvecs.txt")
metadata <- read_csv("../all_data/merged_metadata.csv")
metadata_clean <- metadata %>%
mutate_if(is.character, as.factor) %>%
mutate(essay_id = as.character(essay_id))
Score summaries
Set low vs. high bin score cutoff.
CUTOFF <- 3.5
#median(metadata_clean$score, na.rm = T)
ggplot(metadata_clean, aes(x = score)) +
geom_histogram(binwidth = .5) +
geom_vline(aes(xintercept = CUTOFF), color = "red") +
ggtitle("Score distribution") +
theme_minimal()
n_scores_not_na <- metadata_clean %>%
filter(!is.na(score)) %>%
nrow()
metadata_clean %>%
count(L1_code)
## # A tibble: 11 x 2
## L1_code n
## <fctr> <int>
## 1 ARA 1100
## 2 CHI 1100
## 3 FRE 1100
## 4 GER 1100
## 5 HIN 1100
## 6 ITA 1100
## 7 JPN 1100
## 8 KOR 1100
## 9 SPA 1100
## 10 TEL 1100
## 11 TUR 1100
Of all essays, we have scores for 91% of essays.
Merge all data sources together
d <- doctag_indices %>%
left_join(metadata_clean) %>%
left_join(docvecs) %>%
nest(-1:-10, .key = "doc_vector") %>%
select(-doc_count, -offset, -test_center_country_code) %>%
mutate(prompt_id = fct_recode(prompt_id,
broad.academics = "VC079857",
young.enjoy = "VC139555",
young.help = "VC182427",
advertisements = "VC199494",
fewer.cars = "VC199744",
tour.travel = "VC212639",
students.facts = "VC247638",
success.risks = "VC251653"))
Get TSNE coordinates. These are the same for both the all-languages and by-language analyses.
#mats <- d %>%
# select(doc_vector) %>%
# unnest() %>%
# as.matrix()
#tsne_out = Rtsne::Rtsne(mats)
#tsne_dims <- tsne_out$Y %>%
# as.data.frame() %>%
# rename(tsne_X = V1,
# tsne_Y = V2) %>%
# bind_cols(d %>% select(essay_id, score, age, gender, L1_code, prompt_id)) %>%
# select(everything(), tsne_X, tsne_Y)
# write_csv(tsne_dims, "../all_data/tsne_dims_cached.csv")
tsne_dims <- read_csv("../all_data/tsne_dims_cached.csv")
Get score bins
tsne_AL_prompt_bins <- tsne_dims %>%
nest(1:2, .key = "doc_vector") %>%
filter(!is.na(score)) %>%
group_by(prompt_id) %>%
mutate(score_bin = ifelse(score < CUTOFF, "low", "high"),
prompt_score = paste(prompt_id, score_bin, sep = "_")) %>%
#score_bin = ntile(score, 2)
ungroup()
tsne_AL_prompt_bins %>%
group_by(prompt_id, score_bin) %>%
summarize(n = n(),
mean = mean(score)) %>%
kable(caption = "Bin x prompt summary")
| prompt_id | score_bin | n | mean |
|---|---|---|---|
| advertisements | high | 681 | 4.049192 |
| advertisements | low | 670 | 2.538060 |
| broad.academics | high | 822 | 4.077859 |
| broad.academics | low | 699 | 2.607296 |
| fewer.cars | high | 855 | 4.089474 |
| fewer.cars | low | 681 | 2.621880 |
| students.facts | high | 829 | 4.038601 |
| students.facts | low | 721 | 2.601942 |
| success.risks | high | 765 | 4.005229 |
| success.risks | low | 763 | 2.617955 |
| tour.travel | high | 456 | 4.080044 |
| tour.travel | low | 390 | 2.578205 |
| young.enjoy | high | 809 | 4.038319 |
| young.enjoy | low | 610 | 2.675410 |
| young.help | high | 660 | 4.072727 |
| young.help | low | 589 | 2.558574 |
get_group_centroid <- function(doc_vecs_df, group_name){
centroid <- doc_vecs_df %>%
select(doc_vector) %>%
unnest(doc_vector) %>%
colMeans()
this_group_name <- doc_vecs_df %>%
select_(group_name) %>%
slice(1) %>%
unlist()
df <- data.frame(t(centroid)) %>%
mutate(x = this_group_name) %>%
select(x, everything())
names(df)[1] = c(as.name(group_name))
df
}
# prompt centers
tsne_centroids_AL_prompt <- tsne_AL_prompt_bins %>%
split(.$prompt_id) %>%
map_df(get_group_centroid, "prompt_id") %>%
rename(tsne_X_center = tsne_X,
tsne_Y_center = tsne_Y)
# prompt by score_bin centers
tsne_centroids_AL_prompt_bin <- tsne_AL_prompt_bins %>%
split(.$prompt_score) %>%
map_df(get_group_centroid, "prompt_score") %>%
separate(prompt_score, c("prompt_id", "score_bin"), sep = "_") %>%
left_join(tsne_centroids_AL_prompt)
Plot with mean of each prompt shown as black “x”, with low and high scoring mean for each prompt shown in green and pink.
ggplot(tsne_centroids_AL_prompt_bin) +
geom_point(aes(color = score_bin, x = tsne_X, y = tsne_Y), size = 1.5) +
geom_point(aes(x = tsne_X_center, y = tsne_Y_center), shape = 4, color = "black") +
ggtitle("Mean by prompt and score bin") +
#geom_text_repel(aes(label = prompt_id, x = tsne_X_center, y = tsne_Y_center)) +
theme_minimal()
Low scoring essays tend to be further away from the prompt mean (centroid). Note, however, that there can be many more essays in the “high” bin relative to the “low” bin, depending on how bins are split.
Here is a random effect model predicting distance from prompt mean with score, with prompt_id and L1_code as random effects (by slope and intercept).
tsne_dims_AL_prompt_bin <- tsne_dims %>%
left_join(tsne_centroids_AL_prompt) %>%
rowwise() %>%
mutate(dist_from_prompt_mean = dist(rbind(c(tsne_X_center,
tsne_Y_center),
c(tsne_X, tsne_Y)),
method = "euclidean")[1])
AL_mean_mod = lmer(dist_from_prompt_mean ~ score + (score|prompt_id) + (score|L1_code), tsne_dims_AL_prompt_bin)
summary(AL_mean_mod)
## Linear mixed model fit by REML ['lmerMod']
## Formula: dist_from_prompt_mean ~ score + (score | prompt_id) + (score |
## L1_code)
## Data: tsne_dims_AL_prompt_bin
##
## REML criterion at convergence: 51805.8
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -1.9921 -0.5468 -0.0648 0.3809 15.1688
##
## Random effects:
## Groups Name Variance Std.Dev. Corr
## L1_code (Intercept) 0.51954 0.7208
## score 0.06206 0.2491 -0.86
## prompt_id (Intercept) 0.57750 0.7599
## score 0.03156 0.1777 -0.78
## Residual 6.44028 2.5378
## Number of obs: 11000, groups: L1_code, 11; prompt_id, 8
##
## Fixed effects:
## Estimate Std. Error t value
## (Intercept) 4.6779 0.3609 12.963
## score -0.2347 0.1023 -2.295
##
## Correlation of Fixed Effects:
## (Intr)
## score -0.815
There is a reliable effect of score: Lower scoring essays are further from the group mean.
Here are scatter and 2D density plots for each prompt and score bin.
tsne_AL_prompt_bins <- tsne_dims %>%
filter(!is.na(score)) %>%
group_by(prompt_id) %>%
mutate(score_bin = ifelse(score < CUTOFF, "low", "high")) %>%
#score_bin = ntile(score, 2)
ungroup()
ggplot(tsne_AL_prompt_bins, aes(color = score_bin, x = tsne_X, y = tsne_Y)) +
facet_wrap(~prompt_id) +
geom_point(size = .2, alpha = .3) +
theme_minimal()
ggplot(tsne_AL_prompt_bins, aes(color = score_bin, x = tsne_X, y = tsne_Y)) +
facet_wrap(~prompt_id) +
geom_density2d(alpha = .5) +
theme_minimal()
Overall, it looks like there are a more outliers for the low bin.
Here we measure variance as the max distance between two essays in each bin.
tsne_diameters_LA_max <- tsne_dims %>%
filter(!is.na(score)) %>%
mutate(score_bin = ifelse(score < CUTOFF, "low", "high"),
prompt_score = paste(prompt_id, score_bin, sep = "_")) %>%
nest(1:2, .key = "doc_vector") %>%
split(.$prompt_score) %>%
map_df(get_group_diameter, "prompt_score", "max") %>%
separate(prompt_score, c("prompt_id", "score_bin"), sep = "_")
tsne_diameters_LA_max %>%
group_by(score_bin) %>%
multi_boot_standard(col = "diameter") %>%
ggplot(aes(x = score_bin,
y = mean,
fill = score_bin)) +
geom_bar(stat = "identity") +
geom_linerange(aes(ymin = ci_lower, ymax = ci_upper)) +
ggtitle("Diameter, averaging across prompts") +
ylab("mean diameter (max within-group dist)") +
theme_minimal()
Ranges are bootstrapped 95% CIs. Averaging across prompts, there is no difference betwen high and low scoring bins in terms of their variance. Is there a better measure of variance here?
What about diameter as the average distance to all other points:
tsne_diameters_LA_mean <- tsne_dims %>%
filter(!is.na(score)) %>%
mutate(score_bin = ifelse(score < CUTOFF, "low", "high"),
prompt_score = paste(prompt_id, score_bin, sep = "_")) %>%
nest(1:2, .key = "doc_vector") %>%
split(.$prompt_score) %>%
map_df(get_group_diameter, "prompt_score", "mean_dist") %>%
separate(prompt_score, c("prompt_id", "score_bin"), sep = "_")
tsne_diameters_LA_mean %>%
group_by(score_bin) %>%
multi_boot_standard(col = "diameter") %>%
ggplot(aes(x = score_bin,
y = mean,
fill = score_bin)) +
geom_bar(stat = "identity") +
geom_linerange(aes(ymin = ci_lower, ymax = ci_upper)) +
ggtitle("Diameter, averaging across prompts") +
ylab("mean diameter (average dist to all within-group points)") +
theme_minimal()
With this measure, there is a trend for low scoring essays to have more variance.
low = filter(tsne_diameters_LA_mean, score_bin == "low")
high = filter(tsne_diameters_LA_mean, score_bin == "high")
t.test(low$diameter, high$diameter, paired = TRUE)
##
## Paired t-test
##
## data: low$diameter and high$diameter
## t = 2.5956, df = 7, p-value = 0.03565
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## 0.04046505 0.86901229
## sample estimates:
## mean of the differences
## 0.4547387
In a paired t-test, the difference between score bins is reliable (depending on how you create the bins).
Get score bins
tsne_BL_prompt_bins <- tsne_dims %>%
nest(1:2, .key = "doc_vector") %>%
filter(!is.na(score)) %>%
group_by(prompt_id, L1_code) %>%
mutate(score_bin = ifelse(score < CUTOFF, "low", "high"),
prompt_score_L1 = paste(prompt_id, score_bin, L1_code, sep = "_"),
prompt_L1 = paste(prompt_id, L1_code, sep = "_")) %>%
#score_bin = ntile(score, 2)
ungroup()
#tsne_BL_prompt_bins %>%
# group_by(L1_code, prompt_id, score_bin) %>%
# summarize(n = n(),
# mean = mean(score)) %>%
# kable(caption = "Bin x prompt summary")
# prompt centers
tsne_centroids_BL_prompt <- tsne_BL_prompt_bins %>%
split(.$prompt_L1) %>%
map_df(get_group_centroid, "prompt_L1") %>%
rename(tsne_X_center = tsne_X,
tsne_Y_center = tsne_Y) %>%
separate(prompt_L1, c("prompt_id", "L1_code"), sep = "_")
# prompt by score_bin centers
tsne_centroids_BL_prompt_bin <- tsne_BL_prompt_bins %>%
split(.$prompt_score_L1) %>%
map_df(get_group_centroid, "prompt_score_L1") %>%
separate(prompt_score_L1, c("prompt_id", "score_bin", "L1_code"), sep = "_") %>%
left_join(tsne_centroids_BL_prompt)
Plot with mean of each prompt shown as black “x”, with low and high scoring mean for each prompt shown in green and pink. Each language is a facet
ggplot(tsne_centroids_BL_prompt_bin) +
geom_point(aes(color = score_bin, x = tsne_X, y = tsne_Y), size = 1) +
geom_point(aes(x = tsne_X_center, y = tsne_Y_center), shape = 4, color = "black") +
facet_wrap(~L1_code)+
ggtitle("Mean by prompt, score bin and language") +
theme_minimal()
We fit a mixed-effect model predicting distance from prompt-language centroid for each language. Slopes by prompt are included as random effects.
tsne_dims_BL_prompt_bin <- tsne_dims %>%
left_join(tsne_centroids_BL_prompt) %>%
rowwise() %>%
mutate(dist_from_prompt_mean = dist(rbind(c(tsne_X_center,
tsne_Y_center),
c(tsne_X, tsne_Y)),
method = "euclidean")[1])
tsne_dims_BL_prompt_bin %>%
group_by(L1_code) %>%
do(tidy(lmer(dist_from_prompt_mean ~ score + (score|prompt_id), data=.))) %>%
filter(term == "score") %>%
arrange(statistic) %>%
kable()
| L1_code | term | estimate | std.error | statistic | group |
|---|---|---|---|---|---|
| HIN | score | -0.6080019 | 0.0893076 | -6.8079517 | fixed |
| TEL | score | -0.5172388 | 0.0930001 | -5.5617017 | fixed |
| TUR | score | -0.2506226 | 0.1641483 | -1.5268063 | fixed |
| CHI | score | -0.3083662 | 0.2062118 | -1.4953859 | fixed |
| KOR | score | -0.1485492 | 0.2310076 | -0.6430489 | fixed |
| GER | score | -0.0789303 | 0.1467492 | -0.5378584 | fixed |
| FRE | score | -0.0196478 | 0.2101347 | -0.0935008 | fixed |
| ITA | score | -0.0020177 | 0.1189763 | -0.0169585 | fixed |
| JPN | score | 0.0109130 | 0.2643309 | 0.0412853 | fixed |
| SPA | score | 0.0082294 | 0.0897698 | 0.0916723 | fixed |
| ARA | score | 0.0499715 | 0.1614033 | 0.3096064 | fixed |
The effect of score on mean distance only holds for some languages.
Diameter as the average distance to all other points:
tsne_diameters_LB_mean <- tsne_dims %>%
filter(!is.na(score)) %>%
mutate(score_bin = ifelse(score < CUTOFF, "low", "high"),
prompt_score_L1 = paste(prompt_id, score_bin, L1_code, sep = "_")) %>%
nest(1:2, .key = "doc_vector") %>%
split(.$prompt_score_L1) %>%
map_df(get_group_diameter, "prompt_score_L1", "mean_dist") %>%
separate(prompt_score_L1, c("prompt_id", "score_bin", "L1_code"), sep = "_")
tsne_diameters_LB_mean %>%
group_by(score_bin, L1_code) %>%
multi_boot_standard(col = "diameter", na.rm = T) %>%
ggplot(aes(x = score_bin,
y = mean,
fill = score_bin)) +
geom_bar(stat = "identity") +
facet_wrap(~L1_code) +
geom_linerange(aes(ymin = ci_lower, ymax = ci_upper)) +
ggtitle("Diameter, averaging across prompts") +
ylab("mean diameter (average dist to all within-group points)") +
theme_minimal()
tsne_diameters_LB_mean_wide = tsne_diameters_LB_mean %>%
filter(!is.na(diameter)) %>%
spread(L1_code, diameter)
lapply(tsne_diameters_LB_mean_wide %>% select(-prompt_id, -score_bin), function(x)
t.test(x ~ tsne_diameters_LB_mean_wide$score_bin, paired = TRUE)$statistic) %>%
bind_cols() %>%
gather("L1_code", "t.value") %>%
arrange(t.value) %>%
kable()
With this measure, there is a trend for low scoring essays to have more variance (averaging across prompts).