QUESTION: Are higher-scoring essays closer to native English? Prediction: pairwise word distances for high-scoring essays should be closer to English than those for low-scoring essays.
Here are the differences in correlation between ETS essays and English Wikipedia, comparing high-scoring to low-scoring ETS essays (positive => high > low).
# Load the per-language correlations for the high and low score groups.
# The file is read without a header row, so readr names the columns X1, X2, ...
# The table is then reshaped to one row per language with `high` and `low`
# columns, and the high-minus-low difference is added.
corrs <- read_csv("score_pairwise_corrs_no_stops.csv", col_names = FALSE) %>%
  select(lang = X1, score_group = X2, corr = X3) %>%
  spread(score_group, corr) %>%
  # Positive diff => the high-scoring group has the larger correlation.
  mutate(diff = high - low)
# Bar chart of the high-minus-low correlation difference, one bar per
# language, ordered from the largest to the smallest difference.
# (A stray unnamed `lang` argument inside aes() was removed: it mapped to no
# aesthetic.)
ggplot(corrs, aes(x = reorder(lang, -diff), y = diff)) +
  geom_bar(stat = "identity", position = position_dodge(width = 0.5), fill = "grey") +
  xlab("L2 language") +
  ylab("difference in correlation between high and \nlow scoring essays (h-l)") +
  ggtitle("Diff") +
  theme_classic()
# One-sample, two-sided t-test of the per-language differences against zero.
t.test(corrs$diff, mu = 0, alternative = "two.sided")
##
## One Sample t-test
##
## data: corrs$diff
## t = 3.5617, df = 34, p-value = 0.001114
## alternative hypothesis: true mean is not equal to 0
## 95 percent confidence interval:
## 0.01659506 0.06069463
## sample estimates:
## mean of x
## 0.03864484
Broadly this is true. Higher scoring essays - on average - are more correlated with English than lower scoring essays. But it’s not universally true.
# Correlation for the low-scoring essays, one bar per language, sorted from
# the highest to the lowest correlation.
ggplot(corrs) +
  geom_bar(
    aes(x = reorder(lang, -low), y = low),
    stat = "identity",
    position = position_dodge(width = 0.5),
    fill = "lightblue"
  ) +
  labs(
    x = "L2 language",
    y = "Correlation for low scoring essays",
    title = "Low scoring essays"
  ) +
  theme_classic()
# Correlation for the high-scoring essays, one bar per language, sorted from
# the highest to the lowest correlation.
ggplot(corrs, aes(x = reorder(lang, -high), y = high)) +
  geom_bar(stat = "identity", position = position_dodge(width = 0.5), fill = "pink") +
  xlab("L2 language") +
  # The y axis shows the `high` column; the label previously said "low".
  ylab("Correlation for high scoring essays") +
  ggtitle("High scoring essays") +
  theme_classic()
There’s something funny going on here: the low and high correlations are themselves correlated with each other. Languages with high correlations for low-scoring essays have low correlations for high-scoring essays. Why would this be?
# Scatter of low-score vs high-score correlations, one language label per
# point, with a linear fit drawn on top of the labels.
ggplot(corrs, aes(x = high, y = low)) +
  geom_text(aes(label = lang)) +
  geom_smooth(method = "lm") +
  labs(title = "Correlation between low and high correlations") +
  theme_classic()
# Mean essay score for every (score bin, L1) combination.
# Essays with a score below 3 are binned as "low", all others as "high".
meta_mean_score <- read_csv("../../../data/raw/merged_metadata.csv") %>%
  mutate(essay_id = as.character(essay_id),
         score_bin = ifelse(score < 3, "low", "high")) %>%
  select(essay_id, L1_code, score_bin, score) %>%
  group_by(score_bin, L1_code) %>%
  summarize(mean_score = mean(score)) %>%
  # summarize() only peels off the last grouping variable, so the result would
  # otherwise stay grouped by score_bin; return a plain, ungrouped tibble.
  ungroup()
bad_langs <- c("VIE", "GER", "IBO", "YOR", "THA", "URD", "FAS", "TGL") # these langs don't have all words

# Correlations for every (Wikipedia language, ETS language, score group)
# triple. The file is read without a header row, so the columns arrive as
# X1..X4 and are renamed here.
native_corrs <- read_csv("score_pairwise_corrs_no_stops_native_langs.csv", col_names = FALSE) %>%
  select(wiki_lang = X1, ets_lang = X2, score_group = X3, corr = X4) %>%
  # Drop a pair when either side is one of the excluded languages.
  filter(!(wiki_lang %in% bad_langs),
         !(ets_lang %in% bad_langs)) %>%
  # "within" = same language on both sides, "across" = different languages.
  mutate(group = case_when(wiki_lang == ets_lang ~ "within",
                           wiki_lang != ets_lang ~ "across")) %>%
  # Attach the mean essay score of the matching (ETS language, score group) cell.
  left_join(meta_mean_score,
            by = c("ets_lang" = "L1_code", "score_group" = "score_bin"))
# Heatmap of the correlation for every Wikipedia-language x ETS-language pair,
# one panel per score group; the fill scale diverges around zero.
ggplot(native_corrs) +
  geom_tile(aes(x = wiki_lang, y = ets_lang, fill = corr)) +
  facet_grid(~score_group) +
  scale_fill_gradient2(low = "blue", high = "red", midpoint = 0) +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))

# Mixed model: within/across language pairing crossed with score group, with
# random intercepts and slopes for both the ETS and the Wikipedia language.
summary(
  native_corrs %>%
    lme4::lmer(
      corr ~ group * score_group +
        (group + score_group | ets_lang) +
        (group + score_group | wiki_lang),
      data = .
    )
)
## Linear mixed model fit by REML ['lmerMod']
## Formula: corr ~ group * score_group + (group + score_group | ets_lang) +
## (group + score_group | wiki_lang)
## Data: .
##
## REML criterion at convergence: -7899.2
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -4.2227 -0.4464 0.0103 0.4915 7.7462
##
## Random effects:
## Groups Name Variance Std.Dev. Corr
## ets_lang (Intercept) 9.135e-04 0.030225
## groupwithin 2.455e-06 0.001567 -0.98
## score_grouplow 4.539e-03 0.067374 -0.96 0.99
## wiki_lang (Intercept) 5.033e-03 0.070947
## groupwithin 3.617e-06 0.001902 0.07
## score_grouplow 4.718e-05 0.006869 -0.91 -0.48
## Residual 1.877e-04 0.013701
## Number of obs: 1458, groups: ets_lang, 27; wiki_lang, 27
##
## Fixed effects:
## Estimate Std. Error t value
## (Intercept) 0.334505 0.014850 22.525
## groupwithin 0.008684 0.002728 3.183
## score_grouplow -0.024621 0.013054 -1.886
## groupwithin:score_grouplow -0.009333 0.003800 -2.456
##
## Correlation of Fixed Effects:
## (Intr) grpwth scr_gr
## groupwithin -0.040
## score_grplw -0.459 0.110
## grpwthn:sc_ 0.005 -0.696 -0.011
# Same mixed model as before, with the mean essay score of each
# (ETS language, score group) cell added as a fixed-effect covariate.
summary(
  native_corrs %>%
    lme4::lmer(
      corr ~ group * score_group + mean_score +
        (group + score_group | ets_lang) +
        (group + score_group | wiki_lang),
      data = .
    )
)
## Linear mixed model fit by REML ['lmerMod']
## Formula: corr ~ group * score_group + mean_score + (group + score_group |
## ets_lang) + (group + score_group | wiki_lang)
## Data: .
##
## REML criterion at convergence: -7901.1
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -4.1724 -0.4425 0.0051 0.4859 7.6918
##
## Random effects:
## Groups Name Variance Std.Dev. Corr
## ets_lang (Intercept) 1.248e-04 0.011173
## groupwithin 2.129e-06 0.001459 -0.20
## score_grouplow 2.148e-03 0.046342 -0.35 0.99
## wiki_lang (Intercept) 5.034e-03 0.070950
## groupwithin 3.630e-06 0.001905 -0.05
## score_grouplow 4.718e-05 0.006869 -0.91 -0.37
## Residual 1.877e-04 0.013701
## Number of obs: 1458, groups: ets_lang, 27; wiki_lang, 27
##
## Fixed effects:
## Estimate Std. Error t value
## (Intercept) -0.149808 0.042892 -3.493
## groupwithin 0.008684 0.002726 3.185
## score_grouplow 0.212560 0.021844 9.731
## mean_score 0.130525 0.010942 11.929
## groupwithin:score_grouplow -0.009333 0.003800 -2.456
##
## Correlation of Fixed Effects:
## (Intr) grpwth scr_gr mn_scr
## groupwithin -0.005
## score_grplw -0.886 0.043
## mean_score -0.947 0.000 0.910
## grpwthn:sc_ 0.002 -0.697 -0.006 0.000
Some (weak) evidence consistent with this.
Normalizing by row and column means
# column means
# Mean correlation per Wikipedia language, pooled over every row of
# native_corrs for that language.
wiki_means <- native_corrs %>%
  group_by(wiki_lang) %>%
  summarize(mean_wiki_corr = mean(corr))

# Centre each correlation on its Wikipedia-language mean. Note the sign: the
# value is (mean - corr), so above-average correlations become negative.
n <- native_corrs %>%
  # Explicit key: avoids the "Joining, by = ..." message and guards against
  # accidentally joining on additional shared columns.
  left_join(wiki_means, by = "wiki_lang") %>%
  mutate(corr_normalized = mean_wiki_corr - corr)
# Heatmap of the Wikipedia-mean-centred correlations, one panel per score group.
ggplot(n) +
  geom_tile(aes(x = wiki_lang, y = ets_lang, fill = corr_normalized)) +
  facet_grid(~score_group) +
  scale_fill_gradient2(low = "blue", high = "red", midpoint = 0) +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))

# Mixed model on the centred correlations (same structure as for raw corr).
summary(
  n %>%
    lme4::lmer(
      corr_normalized ~ group * score_group + mean_score +
        (group + score_group | ets_lang) +
        (group + score_group | wiki_lang),
      data = .
    )
)
## Linear mixed model fit by REML ['lmerMod']
## Formula: corr_normalized ~ group * score_group + mean_score + (group +
## score_group | ets_lang) + (group + score_group | wiki_lang)
## Data: .
##
## REML criterion at convergence: -8088.2
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -7.7304 -0.4902 -0.0057 0.4601 4.3809
##
## Random effects:
## Groups Name Variance Std.Dev. Corr
## ets_lang (Intercept) 1.253e-04 0.011195
## groupwithin 3.212e-06 0.001792 -0.27
## score_grouplow 2.155e-03 0.046426 -0.35 1.00
## wiki_lang (Intercept) 1.162e-05 0.003409
## groupwithin 1.866e-06 0.001366 1.00
## score_grouplow 4.708e-05 0.006862 -1.00 -1.00
## Residual 1.843e-04 0.013577
## Number of obs: 1458, groups: ets_lang, 27; wiki_lang, 27
##
## Fixed effects:
## Estimate Std. Error t value
## (Intercept) 0.470197 0.040641 11.569
## groupwithin -0.008684 0.002698 -3.219
## score_grouplow -0.211603 0.021839 -9.689
## mean_score -0.129999 0.010935 -11.888
## groupwithin:score_grouplow 0.009333 0.003765 2.479
##
## Correlation of Fixed Effects:
## (Intr) grpwth scr_gr mn_scr
## groupwithin -0.003
## score_grplw -0.917 0.051
## mean_score -0.998 0.000 0.910
## grpwthn:sc_ 0.002 -0.698 -0.006 0.000
# row means
# Mean correlation per ETS language, pooled over every row of native_corrs
# for that language.
ets_means <- native_corrs %>%
  group_by(ets_lang) %>%
  summarize(mean_ets_corr = mean(corr))

# Centre each correlation on its ETS-language mean. Note the sign: the value
# is (mean - corr), so above-average correlations become negative.
m <- native_corrs %>%
  # Explicit key: avoids the "Joining, by = ..." message and guards against
  # accidentally joining on additional shared columns.
  left_join(ets_means, by = "ets_lang") %>%
  mutate(corr_normalized = mean_ets_corr - corr)
# Heatmap of the ETS-mean-centred correlations, one panel per score group.
ggplot(m) +
  geom_tile(aes(x = wiki_lang, y = ets_lang, fill = corr_normalized)) +
  facet_grid(~score_group) +
  scale_fill_gradient2(low = "blue", high = "red", midpoint = 0) +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))

# Mixed model on the centred correlations (same structure as for raw corr).
# NOTE(review): the recorded output for this fit reports a convergence failure
# (degenerate Hessian) and random-effect correlations of +/-1, so the
# estimates should be treated with caution.
summary(
  m %>%
    lme4::lmer(
      corr_normalized ~ group * score_group + mean_score +
        (group + score_group | ets_lang) +
        (group + score_group | wiki_lang),
      data = .
    )
)
## Linear mixed model fit by REML ['lmerMod']
## Formula: corr_normalized ~ group * score_group + mean_score + (group +
## score_group | ets_lang) + (group + score_group | wiki_lang)
## Data: .
##
## REML criterion at convergence: -7995.7
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -7.8868 -0.4832 -0.0125 0.4516 4.2062
##
## Random effects:
## Groups Name Variance Std.Dev. Corr
## ets_lang (Intercept) 9.899e-04 0.0314629
## groupwithin 1.253e-06 0.0011193 -1.00
## score_grouplow 4.289e-03 0.0654888 -1.00 1.00
## wiki_lang (Intercept) 5.034e-03 0.0709521
## groupwithin 6.725e-08 0.0002593 -1.00
## score_grouplow 4.702e-05 0.0068573 -0.91 0.91
## Residual 1.846e-04 0.0135872
## Number of obs: 1458, groups: ets_lang, 27; wiki_lang, 27
##
## Fixed effects:
## Estimate Std. Error t value
## (Intercept) 0.030148 0.018522 1.628
## groupwithin -0.008684 0.002674 -3.248
## score_grouplow 0.003901 0.013778 0.283
## mean_score -0.011403 0.002949 -3.867
## groupwithin:score_grouplow 0.009333 0.003768 2.477
##
## Correlation of Fixed Effects:
## (Intr) grpwth scr_gr mn_scr
## groupwithin -0.045
## score_grplw -0.594 0.082
## mean_score -0.591 0.000 0.389
## grpwthn:sc_ 0.004 -0.705 -0.010 0.000
## convergence code: 0
## unable to evaluate scaled gradient
## Model failed to converge: degenerate Hessian with 1 negative eigenvalues
bad_langs2 <- c("VIE", "GER", "IBO", "YOR", "THA", "URD", "FAS") # these langs don't have all words

# Re-load the pairwise correlations from the "_H" file and keep only the
# high-score rows. This overwrites the earlier `native_corrs` object.
# The file is read without a header row, so the columns arrive as X1..X4.
native_corrs <- read_csv("score_pairwise_corrs_no_stops_native_langs_H.csv", col_names = FALSE) %>%
  select(wiki_lang = X1, ets_lang = X2, score_group = X3, corr = X4) %>%
  # "within" = same language on both sides, "across" = different languages.
  mutate(group = case_when(wiki_lang == ets_lang ~ "within",
                           wiki_lang != ets_lang ~ "across")) %>%
  filter(score_group == "high") %>%
  # Drop a pair when either side is one of the excluded languages.
  filter(!(wiki_lang %in% bad_langs2),
         !(ets_lang %in% bad_langs2)) %>%
  # Attach the mean essay score of the matching (ETS language, score group) cell.
  left_join(meta_mean_score, by = c("ets_lang" = "L1_code", "score_group" = "score_bin"))
# Heatmap of the correlation for every Wikipedia-language x ETS-language pair
# (faceted by score group; only "high" remains after the filter above).
ggplot(native_corrs) +
  geom_tile(aes(x = wiki_lang, y = ets_lang, fill = corr)) +
  facet_grid(~score_group) +
  scale_fill_gradient2(low = "blue", high = "red", midpoint = 0) +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))

# Mixed model: within/across language pairing only, with random intercepts
# and group slopes for both the ETS and the Wikipedia language.
summary(
  native_corrs %>%
    lme4::lmer(
      corr ~ group +
        (group | ets_lang) +
        (group | wiki_lang),
      data = .
    )
)
## Linear mixed model fit by REML ['lmerMod']
## Formula: corr ~ group + (group | ets_lang) + (group | wiki_lang)
## Data: .
##
## REML criterion at convergence: -5067.2
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -4.1327 -0.5212 -0.0160 0.5072 6.9252
##
## Random effects:
## Groups Name Variance Std.Dev. Corr
## ets_lang (Intercept) 3.647e-04 0.019097
## groupwithin 1.921e-06 0.001386 1.00
## wiki_lang (Intercept) 8.206e-04 0.028646
## groupwithin 7.005e-05 0.008370 -1.00
## Residual 6.052e-05 0.007780
## Number of obs: 784, groups: ets_lang, 28; wiki_lang, 28
##
## Fixed effects:
## Estimate Std. Error t value
## (Intercept) 0.098175 0.006513 15.075
## groupwithin 0.003899 0.002194 1.777
##
## Correlation of Fixed Effects:
## (Intr)
## groupwithin -0.539
# Same model with the mean essay score added as a fixed-effect covariate.
summary(
  native_corrs %>%
    lme4::lmer(
      corr ~ group + mean_score +
        (group | ets_lang) +
        (group | wiki_lang),
      data = .
    )
)
## Linear mixed model fit by REML ['lmerMod']
## Formula:
## corr ~ group + mean_score + (group | ets_lang) + (group | wiki_lang)
## Data: .
##
## REML criterion at convergence: -5116.7
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -4.2203 -0.5315 -0.0157 0.5025 6.9426
##
## Random effects:
## Groups Name Variance Std.Dev. Corr
## ets_lang (Intercept) 4.180e-05 0.0064654
## groupwithin 2.808e-07 0.0005299 1.00
## wiki_lang (Intercept) 8.208e-04 0.0286498
## groupwithin 6.129e-05 0.0078289 -1.00
## Residual 6.057e-05 0.0077827
## Number of obs: 784, groups: ets_lang, 28; wiki_lang, 28
##
## Fixed effects:
## Estimate Std. Error t value
## (Intercept) -0.233395 0.024175 -9.654
## groupwithin 0.003899 0.002108 1.850
## mean_score 0.089220 0.006331 14.093
##
## Correlation of Fixed Effects:
## (Intr) grpwth
## groupwithin -0.156
## mean_score -0.973 0.000
# column means
# Mean correlation per Wikipedia language, pooled over every row of
# native_corrs for that language.
wiki_means <- native_corrs %>%
  group_by(wiki_lang) %>%
  summarize(mean_wiki_corr = mean(corr))

# Centre each correlation on its Wikipedia-language mean. Note the sign: the
# value is (mean - corr), so above-average correlations become negative.
n <- native_corrs %>%
  # Explicit key: avoids the "Joining, by = ..." message and guards against
  # accidentally joining on additional shared columns.
  left_join(wiki_means, by = "wiki_lang") %>%
  mutate(corr_normalized = mean_wiki_corr - corr)
# Heatmap of the Wikipedia-mean-centred correlations, faceted by score group.
ggplot(n) +
  geom_tile(aes(x = wiki_lang, y = ets_lang, fill = corr_normalized)) +
  facet_grid(~score_group) +
  scale_fill_gradient2(low = "blue", high = "red", midpoint = 0) +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))

# Mixed model on the centred correlations.
summary(
  n %>%
    lme4::lmer(
      corr_normalized ~ group + mean_score +
        (group | ets_lang) +
        (group | wiki_lang),
      data = .
    )
)
## Linear mixed model fit by REML ['lmerMod']
## Formula:
## corr_normalized ~ group + mean_score + (group | ets_lang) + (group |
## wiki_lang)
## Data: .
##
## REML criterion at convergence: -5283.7
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -6.9415 -0.4938 0.0046 0.5315 4.3348
##
## Random effects:
## Groups Name Variance Std.Dev. Corr
## ets_lang (Intercept) 4.189e-05 0.0064721
## groupwithin 2.525e-06 0.0015889 1.00
## wiki_lang (Intercept) 5.993e-08 0.0002448
## groupwithin 4.684e-05 0.0068440 -1.00
## Residual 5.881e-05 0.0076691
## Number of obs: 784, groups: ets_lang, 28; wiki_lang, 28
##
## Fixed effects:
## Estimate Std. Error t value
## (Intercept) 0.335703 0.023402 14.345
## groupwithin -0.003899 0.001985 -1.964
## mean_score -0.090295 0.006288 -14.360
##
## Correlation of Fixed Effects:
## (Intr) grpwth
## groupwithin 0.005
## mean_score -0.999 0.000
# row means
# Mean correlation per ETS language, pooled over every row of native_corrs
# for that language.
ets_means <- native_corrs %>%
  group_by(ets_lang) %>%
  summarize(mean_ets_corr = mean(corr))

# Centre each correlation on its ETS-language mean. Note the sign: the value
# is (mean - corr), so above-average correlations become negative.
m <- native_corrs %>%
  # Explicit key: avoids the "Joining, by = ..." message and guards against
  # accidentally joining on additional shared columns.
  left_join(ets_means, by = "ets_lang") %>%
  mutate(corr_normalized = mean_ets_corr - corr)
# Heatmap of the ETS-mean-centred correlations, faceted by score group.
ggplot(m) +
  geom_tile(aes(x = wiki_lang, y = ets_lang, fill = corr_normalized)) +
  facet_grid(~score_group) +
  scale_fill_gradient2(low = "blue", high = "red", midpoint = 0) +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))

# Mixed model on the centred correlations.
# NOTE(review): the recorded output shows zero variance for the ets_lang
# random effects, i.e. a singular fit; presumably centring on the ETS-language
# mean removes that variance -- confirm before interpreting.
summary(
  m %>%
    lme4::lmer(
      corr_normalized ~ group + mean_score +
        (group | ets_lang) +
        (group | wiki_lang),
      data = .
    )
)
## Linear mixed model fit by REML ['lmerMod']
## Formula:
## corr_normalized ~ group + mean_score + (group | ets_lang) + (group |
## wiki_lang)
## Data: .
##
## REML criterion at convergence: -5220.7
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -6.9740 -0.4982 0.0029 0.5333 4.2840
##
## Random effects:
## Groups Name Variance Std.Dev. Corr
## ets_lang (Intercept) 0.000e+00 0.000e+00
## groupwithin 1.697e-16 1.302e-08 NaN
## wiki_lang (Intercept) 8.203e-04 2.864e-02
## groupwithin 5.813e-05 7.624e-03 -1.00
## Residual 5.855e-05 7.652e-03
## Number of obs: 784, groups: ets_lang, 28; wiki_lang, 28
##
## Fixed effects:
## Estimate Std. Error t value
## (Intercept) 0.0025690 0.0074610 0.344
## groupwithin -0.0038987 0.0020603 -1.892
## mean_score -0.0006538 0.0013797 -0.474
##
## Correlation of Fixed Effects:
## (Intr) grpwth
## groupwithin -0.512
## mean_score -0.687 0.000