Here we look at the difference in the distributions of low and high scoring essays along two measures: mean and variance. We do two versions of this analysis: (A) pooling across all languages, and (B) by language. For each, we downsample so that there are the same number of essays in each language x prompt x score bin.
SUMMARY: Lower scoring essays are closer to the mean (both pooling across languages and by language). There is more variance for lower scoring essays, but this effect is driven by just one or two languages.
Parameters
CUTOFF <- 4
N_PER_L_SCORE_PROMPT <- 50
MIN_SCORE <- 2
Read in doctag indices, docvecs, and metadata
doctag_indices <- read_feather("../../all_data/doctag_indices.feather")
docvecs <- read_feather("../../all_data/docvecs.feather") %>%
as.data.frame() %>%
mutate(offset = 0:(n()-1)) %>%
select(offset, everything())
#write_tsv(docvecs, "../all_data/docvecs.txt")
metadata <- read_csv("../../all_data/raw/merged_metadata.csv")
metadata_clean <- metadata %>%
mutate_if(is.character, as.factor) %>%
mutate(essay_id = as.character(essay_id))
n_scores_not_na <- metadata_clean %>%
filter(!is.na(score)) %>%
nrow()
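The proportion of scored essays can be computed directly (a quick check, assuming metadata_clean has one row per essay):
# Percentage of essays with a non-missing score
round(100 * n_scores_not_na / nrow(metadata_clean))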
We have scores for 91% of all essays.
Merge all data sources together
d <- doctag_indices %>%
left_join(metadata_clean) %>%
left_join(docvecs) %>%
nest(-1:-10, .key = "doc_vector") %>%
select(-doc_count, -offset, -test_center_country_code) %>%
mutate(prompt_id = fct_recode(prompt_id,
broad.academics = "VC079857",
young.enjoy = "VC139555",
young.help = "VC182427",
advertisements = "VC199494",
fewer.cars = "VC199744",
tour.travel = "VC212639",
students.facts = "VC247638",
success.risks = "VC251653"))
Get TSNE coordinates. These are the same for both the all-languages and by-language analyses.
#mats <- d %>%
# select(doc_vector) %>%
# unnest() %>%
# as.matrix()
#tsne_out = Rtsne::Rtsne(mats)
#tsne_dims <- tsne_out$Y %>%
# as.data.frame() %>%
# rename(tsne_X = V1,
# tsne_Y = V2) %>%
# bind_cols(d %>% select(essay_id, score, age, gender, L1_code, prompt_id)) %>%
# select(everything(), tsne_X, tsne_Y)
# write_csv(tsne_dims, "../all_data/tsne_dims_cached.csv")
tsne_dims <- read_csv("../../all_data/tsne_dims_cached.csv")
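If the cached embedding ever needs to be regenerated, note that Rtsne is stochastic, so fixing a seed first keeps the result reproducible (a sketch based on the commented-out pipeline above; the seed value is arbitrary):
# Sketch: regenerate the cached embedding reproducibly
# set.seed(42)
# tsne_out <- Rtsne::Rtsne(mats)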
Scores are unevenly distributed across languages, so we need to downsample. Here we take 50 essays per language, score, and prompt.
ggplot(tsne_dims) +
geom_histogram(aes(x = score, fill = L1_code), binwidth = .5) +
facet_wrap(~L1_code) +
theme_bw() +
theme(legend.position = "none")
Downsample essays.
tsne_dims_downsampled <- tsne_dims %>%
nest(1:2, .key = "doc_vector") %>%
filter(score > MIN_SCORE) %>%
filter(!is.na(score)) %>%
group_by(L1_code, score, prompt_id) %>%
sample_n(N_PER_L_SCORE_PROMPT, replace = T) %>%
mutate(score_bin = ifelse(score < CUTOFF, "low", "high"),
score_prompt = paste(score_bin, prompt_id, sep = "_"),
L1_score_prompt = paste(L1_code, score_bin, prompt_id, sep = "_"),
L1_prompt = paste(L1_code, prompt_id, sep = "_")) %>%
ungroup()
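As a sanity check (a sketch, not part of the original pipeline), every language x score x prompt cell should now contain exactly N_PER_L_SCORE_PROMPT essays:
# Sanity check (sketch): each L1 x score x prompt cell should have exactly
# N_PER_L_SCORE_PROMPT rows after downsampling
tsne_dims_downsampled %>%
  count(L1_code, score, prompt_id) %>%
  summarise(all_cells_equal = all(n == N_PER_L_SCORE_PROMPT))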
The bins:
tsne_dims_downsampled %>%
group_by(score_bin) %>%
distinct(score)
## # A tibble: 6 x 2
## # Groups: score_bin [2]
## score score_bin
## <dbl> <chr>
## 1 2.5 low
## 2 3.0 low
## 3 3.5 low
## 4 4.0 high
## 5 4.5 high
## 6 5.0 high
Centroid = grand center. Centroids are computed hierarchically: first per L1 x score bin x prompt, then averaged up to L1 x prompt, then to per-prompt, and finally to a single grand centroid, so that each language and prompt contributes equally.
get_group_centroid <- function(doc_vecs_df, group_name){
  # Mean t-SNE position across all rows in this group
  centroid <- doc_vecs_df %>%
    select(doc_vector) %>%
    unnest(doc_vector) %>%
    colMeans()
  # The group label (identical for every row within a split)
  this_group_name <- doc_vecs_df %>%
    select_(group_name) %>%
    slice(1) %>%
    unlist()
  # One-row data frame: group label first, then centroid coordinates
  df <- data.frame(t(centroid)) %>%
    mutate(x = this_group_name) %>%
    select(x, everything())
  names(df)[1] <- group_name
  df
}
# L1_score_prompt centers
tsne_centroids_BL_score_prompt <- tsne_dims_downsampled %>%
split(.$L1_score_prompt) %>%
map_df(get_group_centroid, "L1_score_prompt") %>%
separate(L1_score_prompt, c("L1_code", "score_bin", "prompt_id"), "_") %>%
nest(4:5, .key = "doc_vector")
grand_centroids <- tsne_centroids_BL_score_prompt %>%   # score x prompt x L1 centroids
  mutate(L1_prompt = paste0(L1_code, "_", prompt_id)) %>%
  split(.$L1_prompt) %>%
  map_df(get_group_centroid, "L1_prompt") %>%           # collapse to prompt x L1 centroids
  separate(L1_prompt, c("L1_code", "prompt_id"), "_") %>%
  group_by(prompt_id) %>%
  summarize(tsne_X = mean(tsne_X),                      # collapse to per-prompt centroids
            tsne_Y = mean(tsne_Y)) %>%
  mutate(group = "all") %>%
  group_by(group) %>%
  summarize(center_tsne_X = mean(tsne_X),               # single grand centroid
            center_tsne_Y = mean(tsne_Y))
group_centroids <- tsne_centroids_BL_score_prompt %>%   # score x prompt x L1 centroids
  mutate(prompt_score = paste0(prompt_id, "_", score_bin)) %>%
  split(.$prompt_score) %>%
  map_df(get_group_centroid, "prompt_score") %>%        # collapse to score x prompt centroids
  mutate(group = "all") %>%
  separate(prompt_score, c("prompt_id", "score_bin"), "_")
distance_measures <- group_centroids %>%
left_join(grand_centroids) %>%
rowwise() %>%
mutate(dist_from_grand_mean = dist(rbind(c(center_tsne_X,
center_tsne_Y),
c(tsne_X, tsne_Y)), method = "euclidean")[1])
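Because the coordinates are two-dimensional, the rowwise dist() call above is just the Euclidean distance; an equivalent vectorized form (a sketch only, not used below) would be:
# Equivalent vectorized Euclidean distance (sketch; the analysis uses dist() above)
distance_measures_alt <- group_centroids %>%
  left_join(grand_centroids) %>%
  mutate(dist_from_grand_mean = sqrt((tsne_X - center_tsne_X)^2 +
                                     (tsne_Y - center_tsne_Y)^2))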
bin_dists <- distance_measures %>%
group_by(score_bin) %>%
multi_boot_standard(col = "dist_from_grand_mean")
kable(bin_dists)
| score_bin | ci_lower | ci_upper | mean |
|---|---|---|---|
| high | 13.24068 | 23.42909 | 18.66798 |
| low | 11.13162 | 21.34514 | 16.64381 |
ggplot(bin_dists, aes(x = score_bin,
y = mean,
fill = score_bin)) +
geom_bar(stat = "identity") +
geom_linerange(aes(ymin = ci_lower, ymax = ci_upper)) +
ggtitle("Distance from grand centroid (bootstrapping across prompts)") +
ylab("distance from grand centroid") +
theme_minimal()
Model
model_data <- tsne_dims_downsampled %>%
mutate(group = "all") %>%
left_join(grand_centroids) %>%
unnest() %>%
rowwise() %>%
mutate(dist_from_grand_mean =
dist(rbind(c(center_tsne_X, center_tsne_Y),
c(tsne_X, tsne_Y)), method = "euclidean")[1])
# these don't converge
#model <- lmer(dist_from_grand_mean ~ score_bin + (score_bin|prompt_id) + (prompt_id|L1_code) + (score_bin|L1_code), data= model_data, control=lmerControl(optimizer = "bobyqa"))
#model <- lmer(dist_from_grand_mean ~ score_bin + (score_bin|prompt_id) + (prompt_id|L1_code) , data= model_data, control=lmerControl(optimizer = "bobyqa"))
#model <- lmer(dist_from_grand_mean ~ score_bin + (prompt_id|L1_code) , data= model_data, control=lmerControl(optimizer = "bobyqa"))
model <- lmer(dist_from_grand_mean ~ score_bin + (1|L1_code) + (1|prompt_id) , data= model_data, control=lmerControl(optimizer = "bobyqa"))
summary(model)
## Linear mixed model fit by REML ['lmerMod']
## Formula:
## dist_from_grand_mean ~ score_bin + (1 | L1_code) + (1 | prompt_id)
## Data: model_data
## Control: lmerControl(optimizer = "bobyqa")
##
## REML criterion at convergence: 25523
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -5.1360 -0.6089 0.0694 0.7035 4.7005
##
## Random effects:
## Groups Name Variance Std.Dev.
## L1_code (Intercept) 0.8699 0.9327
## prompt_id (Intercept) 54.2244 7.3637
## Residual 7.5540 2.7485
## Number of obs: 5230, groups: L1_code, 11; prompt_id, 8
##
## Fixed effects:
## Estimate Std. Error t value
## (Intercept) 19.16907 2.61916 7.319
## score_binlow -1.89676 0.07603 -24.949
##
## Correlation of Fixed Effects:
## (Intr)
## score_binlw -0.014
Lower scoring essays are closer to the grand centroid than higher scoring essays (negative score_binlow coefficient).
Here are scatter and 2D density plots for each prompt and score bin.
ggplot(tsne_dims_downsampled %>% unnest(), aes(color = score_bin, x = tsne_X, y = tsne_Y)) +
facet_wrap(~prompt_id) +
geom_point(size = .2, alpha = .3) +
theme_minimal()
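The diameter analyses below call get_group_diameter, which is defined elsewhere in the project. A minimal sketch of what it might look like, assuming "mean_dist" averages the pairwise Euclidean distances within a group and "max_dist" takes the maximum:
# Hypothetical sketch of get_group_diameter (the project's own definition is what is actually used)
get_group_diameter <- function(doc_vecs_df, group_name, measure = "mean_dist") {
  # All pairwise Euclidean distances between points in the group
  pairwise <- doc_vecs_df %>%
    select(doc_vector) %>%
    unnest(doc_vector) %>%
    as.matrix() %>%
    dist(method = "euclidean")
  # "mean_dist" = average pairwise distance; "max_dist" = maximum pairwise distance
  diameter <- ifelse(measure == "mean_dist", mean(pairwise), max(pairwise))
  data.frame(x = doc_vecs_df[[group_name]][1],
             diameter = diameter,
             stringsAsFactors = FALSE) %>%
    setNames(c(group_name, "diameter"))
}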
Averaging across prompts, there is no difference between high and low scoring bins in their variance when measured as the maximum pairwise distance. Here, diameter is instead measured as the average distance to all other points within the group:
tsne_diameters_LA_mean <- tsne_dims_downsampled %>%
split(.$score_prompt) %>%
map_df(get_group_diameter, "score_prompt", "mean_dist") %>%
separate(score_prompt, c("score_bin","prompt_id"), sep = "_")
tsne_diameters_LA_mean %>%
group_by(score_bin) %>%
multi_boot_standard(col = "diameter") %>%
ggplot(aes(x = score_bin,
y = mean,
fill = score_bin)) +
geom_bar(stat = "identity") +
geom_linerange(aes(ymin = ci_lower, ymax = ci_upper)) +
ggtitle("Diameter, averaging across prompts") +
ylab("mean diameter (average dist to all within-group points)") +
theme_minimal()
With this measure, there is a trend for low scoring essays to have more variance.
low = filter(tsne_diameters_LA_mean, score_bin == "low")
high = filter(tsne_diameters_LA_mean, score_bin == "high")
t.test(low$diameter, high$diameter, paired = TRUE)
##
## Paired t-test
##
## data: low$diameter and high$diameter
## t = 2.6818, df = 7, p-value = 0.03145
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## 0.05742617 0.91354893
## sample estimates:
## mean of the differences
## 0.4854876
In a paired t-test, the difference between score bins is reliable. However, this trend is driven by Arabic, Telugu, and Hindi (see below).
By language: Language centroid
grand_centroids <- tsne_centroids_BL_score_prompt %>%   # score x prompt x L1 centroids
  mutate(L1_prompt = paste0(L1_code, "_", prompt_id)) %>%
  split(.$L1_prompt) %>%
  map_df(get_group_centroid, "L1_prompt") %>%           # collapse to prompt x L1 centroids
  separate(L1_prompt, c("L1_code", "prompt_id"), "_") %>%
  nest(3:4, .key = "doc_vector") %>%
  split(.$L1_code) %>%
  map_df(get_group_centroid, "L1_code") %>%             # collapse to per-language centroids
  rename(center_tsne_X = tsne_X,
         center_tsne_Y = tsne_Y)
distance_measures <- tsne_centroids_BL_score_prompt %>%
unnest() %>%
left_join(grand_centroids) %>%
rowwise () %>%
mutate(dist_from_grand_mean =
dist(rbind(c(center_tsne_X, center_tsne_Y),
c(tsne_X, tsne_Y)), method = "euclidean")[1])
bin_dists <- distance_measures %>%
group_by(L1_code, score_bin) %>%
multi_boot_standard(col = "dist_from_grand_mean")
ggplot(bin_dists, aes(x = L1_code, y = mean, group = score_bin, fill = score_bin)) +
geom_bar(stat = "identity", position = position_dodge() ) +
xlab("Native language") +
geom_linerange(aes(ymin = ci_lower, ymax = ci_upper), position = position_dodge(.9)) +
ylab("Euclidean distance from language centroid") +
theme_minimal() +
theme(text = element_text(size = 15),
axis.text.x = element_text(angle = 90, hjust = 1)) +
scale_fill_brewer(palette = "Set1", name = "Score")
kable(bin_dists)
| L1_code | score_bin | ci_lower | ci_upper | mean |
|---|---|---|---|---|
| ARA | high | 12.737239 | 23.40105 | 18.58388 |
| ARA | low | 10.657336 | 21.26739 | 16.41873 |
| CHI | high | 12.681038 | 23.56500 | 18.41718 |
| CHI | low | 10.201051 | 21.53665 | 15.94971 |
| FRE | high | 12.519551 | 24.37193 | 18.55082 |
| FRE | low | 10.397365 | 21.28758 | 16.28284 |
| GER | high | 12.806644 | 23.45367 | 18.16070 |
| GER | low | 11.091125 | 21.81126 | 16.61737 |
| HIN | high | 15.795136 | 24.28444 | 20.31462 |
| HIN | low | 14.407386 | 24.02795 | 19.31501 |
| ITA | high | 12.563921 | 22.84933 | 18.10858 |
| ITA | low | 10.404202 | 21.08777 | 16.13474 |
| JPN | high | 13.019992 | 23.36015 | 18.06883 |
| JPN | low | 10.014038 | 20.52140 | 15.57402 |
| KOR | high | 13.107354 | 23.26556 | 18.64588 |
| KOR | low | 10.733554 | 20.95135 | 16.20982 |
| SPA | high | 11.717962 | 23.14103 | 17.80769 |
| SPA | low | 9.925172 | 20.90527 | 15.84656 |
| TEL | high | 15.703953 | 24.83517 | 20.64367 |
| TEL | low | 14.397329 | 22.97448 | 19.03851 |
| TUR | high | 13.485145 | 23.02447 | 18.62233 |
| TUR | low | 11.661252 | 21.54227 | 16.84659 |
Model
model_data <- tsne_dims_downsampled %>%
left_join(grand_centroids) %>%
unnest() %>%
rowwise() %>%
mutate(dist_from_grand_mean =
dist(rbind(c(center_tsne_X, center_tsne_Y),
c(tsne_X, tsne_Y)), method = "euclidean")[1])
model_data %>%
group_by(L1_code) %>%
do(tidy(lmer(dist_from_grand_mean ~ score_bin + (score_bin|prompt_id), data=.,control=lmerControl(optimizer = "bobyqa")))) %>%
filter(term == "score_binlow") %>%
arrange(statistic) %>%
kable()
| L1_code | term | estimate | std.error | statistic | group |
|---|---|---|---|---|---|
| CHI | score_binlow | -2.522905 | 0.3440170 | -7.333662 | fixed |
| KOR | score_binlow | -2.260177 | 0.3341653 | -6.763650 | fixed |
| SPA | score_binlow | -1.868965 | 0.2981305 | -6.268951 | fixed |
| FRE | score_binlow | -2.212007 | 0.4367975 | -5.064146 | fixed |
| TEL | score_binlow | -1.500903 | 0.3191746 | -4.702451 | fixed |
| TUR | score_binlow | -1.712177 | 0.4142930 | -4.132767 | fixed |
| JPN | score_binlow | -2.221086 | 0.6025168 | -3.686347 | fixed |
| ITA | score_binlow | -1.739164 | 0.4896557 | -3.551809 | fixed |
| ARA | score_binlow | -2.205054 | 0.6458240 | -3.414327 | fixed |
| GER | score_binlow | -1.333625 | 0.4820483 | -2.766580 | fixed |
| HIN | score_binlow | -1.117017 | 0.4340396 | -2.573536 | fixed |
Across all languages, lower scoring essays are closer to their language's centroid.
bin_dists_low <- distance_measures %>%
group_by(L1_code, score_bin) %>%
summarize(sd = sd(dist_from_grand_mean),
mean = mean(dist_from_grand_mean),
n = n()) %>%
filter(score_bin == "low") %>%
rename(low_sd = sd,
low_mean = mean,
low_n = n)
bin_dists_high <- distance_measures %>%
group_by(L1_code, score_bin) %>%
summarize(sd = sd(dist_from_grand_mean),
mean = mean(dist_from_grand_mean),
n = n()) %>%
filter(score_bin == "high") %>%
rename(high_sd = sd,
high_mean = mean,
high_n = n)
all_dists = inner_join(bin_dists_low, bin_dists_high, by = "L1_code")
effect_sizes = all_dists %>%
rowwise() %>%
mutate(d = compute.es::mes( high_mean, low_mean, high_sd, low_sd, high_n * 10, low_n * 10, verbose = FALSE)$d)
effect_sizes %>%
select(L1_code, d) %>%
arrange(-d) %>%
kable()
| L1_code | d |
|---|---|
| JPN | 0.31 |
| KOR | 0.31 |
| CHI | 0.29 |
| ARA | 0.26 |
| FRE | 0.26 |
| ITA | 0.24 |
| SPA | 0.23 |
| TEL | 0.23 |
| TUR | 0.23 |
| GER | 0.18 |
| HIN | 0.14 |
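For reference, the d reported by compute.es::mes is the standardized mean difference using a pooled SD; a sketch of that computation:
# Sketch: Cohen's d with a pooled SD (what compute.es::mes reports as d)
cohens_d <- function(m1, m2, sd1, sd2, n1, n2) {
  sd_pooled <- sqrt(((n1 - 1) * sd1^2 + (n2 - 1) * sd2^2) / (n1 + n2 - 2))
  (m1 - m2) / sd_pooled
}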
Diameter as the average distance to all other points:
tsne_diameters_LB_mean <- tsne_dims_downsampled %>%
split(.$L1_score_prompt) %>%
map_df(get_group_diameter, "L1_score_prompt", "mean_dist") %>%
separate(L1_score_prompt, c("L1_code" ,"score_bin", "prompt_id"), sep = "_")
tsne_diameters_LB_mean %>%
group_by(score_bin, L1_code) %>%
multi_boot_standard(col = "diameter", na.rm = T) %>%
ggplot(aes(x = score_bin,
y = mean,
fill = score_bin)) +
geom_bar(stat = "identity") +
facet_wrap(~L1_code) +
geom_linerange(aes(ymin = ci_lower, ymax = ci_upper)) +
ggtitle("Diameter, averaging across prompts") +
ylab("mean diameter (average dist to all within-group points)") +
theme_minimal()
tsne_diameters_LB_mean_wide = tsne_diameters_LB_mean %>%
filter(!is.na(diameter)) %>%
spread(L1_code, diameter)
lapply(tsne_diameters_LB_mean_wide %>% select(-prompt_id, -score_bin), function(x)
t.test(x ~ tsne_diameters_LB_mean_wide$score_bin, paired = TRUE)$statistic) %>%
bind_cols() %>%
gather("L1_code", "t.value") %>%
arrange(t.value) %>%
kable()
| L1_code | t.value |
|---|---|
| TEL | -2.3056800 |
| HIN | -1.8555782 |
| ARA | -1.1745673 |
| KOR | -0.6999361 |
| GER | -0.5066852 |
| SPA | -0.4778886 |
| TUR | -0.3570271 |
| ITA | -0.2680945 |
| CHI | -0.2591910 |
| JPN | -0.2027548 |
| FRE | 0.3216314 |
There is no effect here, except for a few languages.