QUESTION: Are essays that scored higher closer to the writing of native English speakers? Prediction: pairwise word distances for high-scoring essays should be closer to English than those for low-scoring essays.

Here are the differences in correlation between ETS and English Wikipedia, comparing high ETS to low ETS (positive => h > l).

0.0.1 Diff

# Load the per-language correlations. The CSV has no header row, so readr
# names the columns X1, X2, ...; only the first three are used.
# `FALSE` is spelled out because `F` is an ordinary, reassignable variable.
corrs <- read_csv("score_pairwise_corrs_no_stops.csv", col_names = FALSE) %>%
  # Keep and name the three columns in one step.
  select(lang = X1, score_group = X2, corr = X3) %>%
  # One row per language, with one column per score group (`high`, `low`).
  spread(score_group, corr) %>%
  # Positive diff => the high-scoring correlation exceeds the low-scoring one.
  mutate(diff = high - low)

# Bar chart of the high-minus-low correlation difference per language, with
# languages ordered from the largest to the smallest difference.
# The stray unnamed `lang` argument that used to sit inside aes() is removed:
# it mapped to no aesthetic.
ggplot(corrs, aes(x = reorder(lang, -diff), y = diff)) +
  geom_bar(
    stat = "identity",
    position = position_dodge(width = 0.5),
    fill = "grey"
  ) +
  xlab("L2 language") +
  ylab("difference in correlation between high and \nlow scoring essays (h-l)") +
  ggtitle("Diff") +
  theme_classic()

# One-sample, two-sided t-test of the per-language differences against 0.
t.test(x = corrs$diff, mu = 0)
## 
##  One Sample t-test
## 
## data:  corrs$diff
## t = 3.5617, df = 34, p-value = 0.001114
## alternative hypothesis: true mean is not equal to 0
## 95 percent confidence interval:
##  0.01659506 0.06069463
## sample estimates:
##  mean of x 
## 0.03864484

Broadly this is true. Higher scoring essays - on average - are more correlated with English than lower scoring essays. But it’s not universally true.

0.0.2 Low vs. High:

# Correlation for the low-scoring essays, one bar per language, ordered from
# the highest to the lowest value.
ggplot(data = corrs, mapping = aes(x = reorder(lang, -low), y = low)) +
  geom_col(position = position_dodge(width = 0.5), fill = "lightblue") +
  labs(
    x = "L2 language",
    y = "Correlation for low scoring essays",
    title = "Low scoring essays"
  ) +
  theme_classic()

# Correlation for the high-scoring essays, one bar per language, ordered from
# the highest to the lowest value.
ggplot(corrs, aes(x = reorder(lang, -high), y = high)) +
  geom_bar(
    stat = "identity",
    position = position_dodge(width = 0.5),
    fill = "pink"
  ) +
  xlab("L2 language") +
  # The label previously said "low scoring essays" (copy-paste slip); this
  # panel plots the `high` column.
  ylab("Correlation for high scoring essays") +
  ggtitle("High scoring essays") +
  theme_classic()

There’s something funny going on here: the low and high correlations are themselves correlated with each other. Languages whose low-scoring essays have high correlations have high-scoring essays with low correlations. Why would this be?

# Scatter of the low-score correlation against the high-score correlation,
# drawn as language labels, with a linear fit layered on top.
ggplot(data = corrs, mapping = aes(x = high, y = low)) +
  geom_text(mapping = aes(label = lang)) +
  geom_smooth(method = "lm") +
  labs(title = "Correlation between low and high correlations") +
  theme_classic()

0.1 Compared to native (low vs. high)

# Mean essay score per (score bin, L1 code). Essays scoring below 3 are
# binned as "low"; all others as "high".
meta_mean_score <- read_csv("../../../data/raw/merged_metadata.csv") %>%
  mutate(
    essay_id = as.character(essay_id),
    score_bin = ifelse(score < 3, "low", "high")
  ) %>%
  select(essay_id, L1_code, score_bin, score) %>%
  group_by(score_bin, L1_code) %>%
  summarize(mean_score = mean(score)) %>%
  # summarize() only drops the last grouping level; ungroup so the lookup
  # table does not carry a leftover `score_bin` grouping.
  ungroup()
# Languages excluded on both the Wikipedia and the ETS side.
bad_langs <- c("VIE", "GER", "IBO", "YOR", "THA", "URD", "FAS", "TGL") # these langs don't have all words

# Correlations keyed by Wikipedia language, ETS language and score group.
# The CSV has no header row, so readr names the columns X1..X4.
# `FALSE` is spelled out because `F` is an ordinary, reassignable variable.
native_corrs <- read_csv(
  "score_pairwise_corrs_no_stops_native_langs.csv",
  col_names = FALSE
) %>%
  select(wiki_lang = X1, ets_lang = X2, score_group = X3, corr = X4) %>%
  # Drop a row if either of its languages is in the exclusion list.
  filter(!(wiki_lang %in% bad_langs), !(ets_lang %in% bad_langs)) %>%
  # "within": Wikipedia and ETS language are the same; "across": they differ.
  mutate(group = case_when(
    wiki_lang == ets_lang ~ "within",
    wiki_lang != ets_lang ~ "across"
  )) %>%
  # Attach the mean essay score for the matching ETS language and score bin.
  left_join(
    meta_mean_score,
    by = c("ets_lang" = "L1_code", "score_group" = "score_bin")
  )

# Heatmap of corr (Wikipedia language on x, ETS language on y), one panel per
# score group; the fill scale diverges around 0.
ggplot(
  data = native_corrs,
  mapping = aes(x = wiki_lang, y = ets_lang, fill = corr)
) +
  geom_tile() +
  facet_grid(. ~ score_group) +
  scale_fill_gradient2(low = "blue", high = "red", midpoint = 0) +
  # Rotate the x labels so the language codes do not overlap.
  theme(axis.text.x = element_text(angle = 90, hjust = 1))

# Mixed model for corr.
#   fixed:  group, score_group and their interaction
#   random: intercept plus group and score_group slopes, separately for
#           each ETS language and each Wikipedia language
native_corrs %>%
  lme4::lmer(
    corr ~ group * score_group +
      (group + score_group | ets_lang) +
      (group + score_group | wiki_lang),
    data = .
  ) %>%
  summary()
## Linear mixed model fit by REML ['lmerMod']
## Formula: corr ~ group * score_group + (group + score_group | ets_lang) +  
##     (group + score_group | wiki_lang)
##    Data: .
## 
## REML criterion at convergence: -7899.2
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -4.2227 -0.4464  0.0103  0.4915  7.7462 
## 
## Random effects:
##  Groups    Name           Variance  Std.Dev. Corr       
##  ets_lang  (Intercept)    9.135e-04 0.030225            
##            groupwithin    2.455e-06 0.001567 -0.98      
##            score_grouplow 4.539e-03 0.067374 -0.96  0.99
##  wiki_lang (Intercept)    5.033e-03 0.070947            
##            groupwithin    3.617e-06 0.001902  0.07      
##            score_grouplow 4.718e-05 0.006869 -0.91 -0.48
##  Residual                 1.877e-04 0.013701            
## Number of obs: 1458, groups:  ets_lang, 27; wiki_lang, 27
## 
## Fixed effects:
##                             Estimate Std. Error t value
## (Intercept)                 0.334505   0.014850  22.525
## groupwithin                 0.008684   0.002728   3.183
## score_grouplow             -0.024621   0.013054  -1.886
## groupwithin:score_grouplow -0.009333   0.003800  -2.456
## 
## Correlation of Fixed Effects:
##             (Intr) grpwth scr_gr
## groupwithin -0.040              
## score_grplw -0.459  0.110       
## grpwthn:sc_  0.005 -0.696 -0.011
# Same mixed model as the preceding fit, with mean_score added as a further
# fixed effect.
native_corrs %>%
  lme4::lmer(
    corr ~ group * score_group + mean_score +
      (group + score_group | ets_lang) +
      (group + score_group | wiki_lang),
    data = .
  ) %>%
  summary()
## Linear mixed model fit by REML ['lmerMod']
## Formula: corr ~ group * score_group + mean_score + (group + score_group |  
##     ets_lang) + (group + score_group | wiki_lang)
##    Data: .
## 
## REML criterion at convergence: -7901.1
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -4.1724 -0.4425  0.0051  0.4859  7.6918 
## 
## Random effects:
##  Groups    Name           Variance  Std.Dev. Corr       
##  ets_lang  (Intercept)    1.248e-04 0.011173            
##            groupwithin    2.129e-06 0.001459 -0.20      
##            score_grouplow 2.148e-03 0.046342 -0.35  0.99
##  wiki_lang (Intercept)    5.034e-03 0.070950            
##            groupwithin    3.630e-06 0.001905 -0.05      
##            score_grouplow 4.718e-05 0.006869 -0.91 -0.37
##  Residual                 1.877e-04 0.013701            
## Number of obs: 1458, groups:  ets_lang, 27; wiki_lang, 27
## 
## Fixed effects:
##                             Estimate Std. Error t value
## (Intercept)                -0.149808   0.042892  -3.493
## groupwithin                 0.008684   0.002726   3.185
## score_grouplow              0.212560   0.021844   9.731
## mean_score                  0.130525   0.010942  11.929
## groupwithin:score_grouplow -0.009333   0.003800  -2.456
## 
## Correlation of Fixed Effects:
##             (Intr) grpwth scr_gr mn_scr
## groupwithin -0.005                     
## score_grplw -0.886  0.043              
## mean_score  -0.947  0.000  0.910       
## grpwthn:sc_  0.002 -0.697 -0.006  0.000

Some (weak) evidence consistent with this.

Normalizing by row and column means

# Mean corr per Wikipedia language, taken over every row of native_corrs
# for that language.
wiki_means <- native_corrs %>%
  group_by(wiki_lang) %>%
  summarize(mean_wiki_corr = mean(corr))

# column means
# Normalize by the Wikipedia-language mean. The value is mean minus corr, so
# a positive corr_normalized marks a cell BELOW its column mean.
n <- native_corrs %>%
  # Explicit key: do not depend on whichever column names happen to be shared.
  left_join(wiki_means, by = "wiki_lang") %>%
  mutate(corr_normalized = mean_wiki_corr - corr)

# Heatmap of the column-mean-normalized correlations, one panel per score
# group; the fill scale diverges around 0.
ggplot(
  data = n,
  mapping = aes(x = wiki_lang, y = ets_lang, fill = corr_normalized)
) +
  geom_tile() +
  facet_grid(. ~ score_group) +
  scale_fill_gradient2(low = "blue", high = "red", midpoint = 0) +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))

# Mixed model on the column-mean-normalized correlation: group * score_group
# and mean_score as fixed effects, random intercepts and group / score_group
# slopes for both ets_lang and wiki_lang.
n %>%
  lme4::lmer(
    corr_normalized ~ group * score_group + mean_score +
      (group + score_group | ets_lang) +
      (group + score_group | wiki_lang),
    data = .
  ) %>%
  summary()
## Linear mixed model fit by REML ['lmerMod']
## Formula: corr_normalized ~ group * score_group + mean_score + (group +  
##     score_group | ets_lang) + (group + score_group | wiki_lang)
##    Data: .
## 
## REML criterion at convergence: -8088.2
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -7.7304 -0.4902 -0.0057  0.4601  4.3809 
## 
## Random effects:
##  Groups    Name           Variance  Std.Dev. Corr       
##  ets_lang  (Intercept)    1.253e-04 0.011195            
##            groupwithin    3.212e-06 0.001792 -0.27      
##            score_grouplow 2.155e-03 0.046426 -0.35  1.00
##  wiki_lang (Intercept)    1.162e-05 0.003409            
##            groupwithin    1.866e-06 0.001366  1.00      
##            score_grouplow 4.708e-05 0.006862 -1.00 -1.00
##  Residual                 1.843e-04 0.013577            
## Number of obs: 1458, groups:  ets_lang, 27; wiki_lang, 27
## 
## Fixed effects:
##                             Estimate Std. Error t value
## (Intercept)                 0.470197   0.040641  11.569
## groupwithin                -0.008684   0.002698  -3.219
## score_grouplow             -0.211603   0.021839  -9.689
## mean_score                 -0.129999   0.010935 -11.888
## groupwithin:score_grouplow  0.009333   0.003765   2.479
## 
## Correlation of Fixed Effects:
##             (Intr) grpwth scr_gr mn_scr
## groupwithin -0.003                     
## score_grplw -0.917  0.051              
## mean_score  -0.998  0.000  0.910       
## grpwthn:sc_  0.002 -0.698 -0.006  0.000
# row means
# Mean corr per ETS language, taken over every row of native_corrs for that
# language.
ets_means <- native_corrs %>%
  group_by(ets_lang) %>%
  summarize(mean_ets_corr = mean(corr))

# Normalize by the ETS-language mean. The value is mean minus corr, so a
# positive corr_normalized marks a cell BELOW its row mean.
m <- native_corrs %>%
  # Explicit key: do not depend on whichever column names happen to be shared.
  left_join(ets_means, by = "ets_lang") %>%
  mutate(corr_normalized = mean_ets_corr - corr)

# Heatmap of the row-mean-normalized correlations, one panel per score group;
# the fill scale diverges around 0.
ggplot(
  data = m,
  mapping = aes(x = wiki_lang, y = ets_lang, fill = corr_normalized)
) +
  geom_tile() +
  facet_grid(. ~ score_group) +
  scale_fill_gradient2(low = "blue", high = "red", midpoint = 0) +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))

# Mixed model on the row-mean-normalized correlation, same fixed and random
# structure as the column-normalized fit.
# NOTE(review): the recorded output for this fit ends with a "Model failed to
# converge" warning, so treat these estimates with caution.
m %>%
  lme4::lmer(
    corr_normalized ~ group * score_group + mean_score +
      (group + score_group | ets_lang) +
      (group + score_group | wiki_lang),
    data = .
  ) %>%
  summary()
## Linear mixed model fit by REML ['lmerMod']
## Formula: corr_normalized ~ group * score_group + mean_score + (group +  
##     score_group | ets_lang) + (group + score_group | wiki_lang)
##    Data: .
## 
## REML criterion at convergence: -7995.7
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -7.8868 -0.4832 -0.0125  0.4516  4.2062 
## 
## Random effects:
##  Groups    Name           Variance  Std.Dev.  Corr       
##  ets_lang  (Intercept)    9.899e-04 0.0314629            
##            groupwithin    1.253e-06 0.0011193 -1.00      
##            score_grouplow 4.289e-03 0.0654888 -1.00  1.00
##  wiki_lang (Intercept)    5.034e-03 0.0709521            
##            groupwithin    6.725e-08 0.0002593 -1.00      
##            score_grouplow 4.702e-05 0.0068573 -0.91  0.91
##  Residual                 1.846e-04 0.0135872            
## Number of obs: 1458, groups:  ets_lang, 27; wiki_lang, 27
## 
## Fixed effects:
##                             Estimate Std. Error t value
## (Intercept)                 0.030148   0.018522   1.628
## groupwithin                -0.008684   0.002674  -3.248
## score_grouplow              0.003901   0.013778   0.283
## mean_score                 -0.011403   0.002949  -3.867
## groupwithin:score_grouplow  0.009333   0.003768   2.477
## 
## Correlation of Fixed Effects:
##             (Intr) grpwth scr_gr mn_scr
## groupwithin -0.045                     
## score_grplw -0.594  0.082              
## mean_score  -0.591  0.000  0.389       
## grpwthn:sc_  0.004 -0.705 -0.010  0.000
## convergence code: 0
## unable to evaluate scaled gradient
## Model failed to converge: degenerate  Hessian with 1 negative eigenvalues

0.2 Compared to native, high only (more words)

# Exclusion list for the high-only comparison.
bad_langs2 <- c("VIE", "GER", "IBO", "YOR", "THA", "URD", "FAS") # these langs don't have all words

# Rebuild native_corrs from the "_H" file; this overwrites any native_corrs
# defined earlier. The CSV has no header row, so readr names the columns
# X1..X4. `FALSE` is spelled out because `F` is a reassignable variable.
native_corrs <- read_csv(
  "score_pairwise_corrs_no_stops_native_langs_H.csv",
  col_names = FALSE
) %>%
  select(wiki_lang = X1, ets_lang = X2, score_group = X3, corr = X4) %>%
  # "within": Wikipedia and ETS language are the same; "across": they differ.
  mutate(group = case_when(
    wiki_lang == ets_lang ~ "within",
    wiki_lang != ets_lang ~ "across"
  )) %>%
  # Keep only the high score group and drop rows with an excluded language.
  filter(
    score_group == "high",
    !(wiki_lang %in% bad_langs2),
    !(ets_lang %in% bad_langs2)
  ) %>%
  # Attach the mean essay score for the matching ETS language and score bin.
  left_join(
    meta_mean_score,
    by = c("ets_lang" = "L1_code", "score_group" = "score_bin")
  )

# Heatmap of corr for the high-only data (Wikipedia language on x, ETS
# language on y), faceted by score group; the fill scale diverges around 0.
ggplot(
  data = native_corrs,
  mapping = aes(x = wiki_lang, y = ets_lang, fill = corr)
) +
  geom_tile() +
  facet_grid(. ~ score_group) +
  scale_fill_gradient2(low = "blue", high = "red", midpoint = 0) +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))

# Mixed model for corr with group as the only fixed effect; random intercept
# and group slope for each ETS language and each Wikipedia language.
native_corrs %>%
  lme4::lmer(
    corr ~ group +
      (group | ets_lang) +
      (group | wiki_lang),
    data = .
  ) %>%
  summary()
## Linear mixed model fit by REML ['lmerMod']
## Formula: corr ~ group + (group | ets_lang) + (group | wiki_lang)
##    Data: .
## 
## REML criterion at convergence: -5067.2
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -4.1327 -0.5212 -0.0160  0.5072  6.9252 
## 
## Random effects:
##  Groups    Name        Variance  Std.Dev. Corr 
##  ets_lang  (Intercept) 3.647e-04 0.019097      
##            groupwithin 1.921e-06 0.001386 1.00 
##  wiki_lang (Intercept) 8.206e-04 0.028646      
##            groupwithin 7.005e-05 0.008370 -1.00
##  Residual              6.052e-05 0.007780      
## Number of obs: 784, groups:  ets_lang, 28; wiki_lang, 28
## 
## Fixed effects:
##             Estimate Std. Error t value
## (Intercept) 0.098175   0.006513  15.075
## groupwithin 0.003899   0.002194   1.777
## 
## Correlation of Fixed Effects:
##             (Intr)
## groupwithin -0.539
# Same model as the preceding fit, with mean_score added as a fixed effect.
native_corrs %>%
  lme4::lmer(
    corr ~ group + mean_score +
      (group | ets_lang) +
      (group | wiki_lang),
    data = .
  ) %>%
  summary()
## Linear mixed model fit by REML ['lmerMod']
## Formula: 
## corr ~ group + mean_score + (group | ets_lang) + (group | wiki_lang)
##    Data: .
## 
## REML criterion at convergence: -5116.7
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -4.2203 -0.5315 -0.0157  0.5025  6.9426 
## 
## Random effects:
##  Groups    Name        Variance  Std.Dev.  Corr 
##  ets_lang  (Intercept) 4.180e-05 0.0064654      
##            groupwithin 2.808e-07 0.0005299 1.00 
##  wiki_lang (Intercept) 8.208e-04 0.0286498      
##            groupwithin 6.129e-05 0.0078289 -1.00
##  Residual              6.057e-05 0.0077827      
## Number of obs: 784, groups:  ets_lang, 28; wiki_lang, 28
## 
## Fixed effects:
##              Estimate Std. Error t value
## (Intercept) -0.233395   0.024175  -9.654
## groupwithin  0.003899   0.002108   1.850
## mean_score   0.089220   0.006331  14.093
## 
## Correlation of Fixed Effects:
##             (Intr) grpwth
## groupwithin -0.156       
## mean_score  -0.973  0.000
# Mean corr per Wikipedia language, computed from the current native_corrs.
wiki_means <- native_corrs %>%
  group_by(wiki_lang) %>%
  summarize(mean_wiki_corr = mean(corr))

# column means
# Normalize by the Wikipedia-language mean. The value is mean minus corr, so
# a positive corr_normalized marks a cell BELOW its column mean.
n <- native_corrs %>%
  # Explicit key: do not depend on whichever column names happen to be shared.
  left_join(wiki_means, by = "wiki_lang") %>%
  mutate(corr_normalized = mean_wiki_corr - corr)

# Heatmap of the column-mean-normalized correlations, faceted by score group;
# the fill scale diverges around 0.
ggplot(
  data = n,
  mapping = aes(x = wiki_lang, y = ets_lang, fill = corr_normalized)
) +
  geom_tile() +
  facet_grid(. ~ score_group) +
  scale_fill_gradient2(low = "blue", high = "red", midpoint = 0) +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))

# Mixed model on the column-mean-normalized correlation: group and mean_score
# as fixed effects, random intercept and group slope for both ets_lang and
# wiki_lang.
n %>%
  lme4::lmer(
    corr_normalized ~ group + mean_score +
      (group | ets_lang) +
      (group | wiki_lang),
    data = .
  ) %>%
  summary()
## Linear mixed model fit by REML ['lmerMod']
## Formula: 
## corr_normalized ~ group + mean_score + (group | ets_lang) + (group |  
##     wiki_lang)
##    Data: .
## 
## REML criterion at convergence: -5283.7
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -6.9415 -0.4938  0.0046  0.5315  4.3348 
## 
## Random effects:
##  Groups    Name        Variance  Std.Dev.  Corr 
##  ets_lang  (Intercept) 4.189e-05 0.0064721      
##            groupwithin 2.525e-06 0.0015889 1.00 
##  wiki_lang (Intercept) 5.993e-08 0.0002448      
##            groupwithin 4.684e-05 0.0068440 -1.00
##  Residual              5.881e-05 0.0076691      
## Number of obs: 784, groups:  ets_lang, 28; wiki_lang, 28
## 
## Fixed effects:
##              Estimate Std. Error t value
## (Intercept)  0.335703   0.023402  14.345
## groupwithin -0.003899   0.001985  -1.964
## mean_score  -0.090295   0.006288 -14.360
## 
## Correlation of Fixed Effects:
##             (Intr) grpwth
## groupwithin  0.005       
## mean_score  -0.999  0.000
# row means
# Mean corr per ETS language, computed from the current native_corrs.
ets_means <- native_corrs %>%
  group_by(ets_lang) %>%
  summarize(mean_ets_corr = mean(corr))

# Normalize by the ETS-language mean. The value is mean minus corr, so a
# positive corr_normalized marks a cell BELOW its row mean.
m <- native_corrs %>%
  # Explicit key: do not depend on whichever column names happen to be shared.
  left_join(ets_means, by = "ets_lang") %>%
  mutate(corr_normalized = mean_ets_corr - corr)

# Heatmap of the row-mean-normalized correlations, faceted by score group;
# the fill scale diverges around 0.
ggplot(
  data = m,
  mapping = aes(x = wiki_lang, y = ets_lang, fill = corr_normalized)
) +
  geom_tile() +
  facet_grid(. ~ score_group) +
  scale_fill_gradient2(low = "blue", high = "red", midpoint = 0) +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))

# Mixed model on the row-mean-normalized correlation: group and mean_score as
# fixed effects, random intercept and group slope for both ets_lang and
# wiki_lang.
# NOTE(review): the recorded output shows zero variance for the ets_lang
# intercept, so that random term may be redundant here; confirm.
m %>%
  lme4::lmer(
    corr_normalized ~ group + mean_score +
      (group | ets_lang) +
      (group | wiki_lang),
    data = .
  ) %>%
  summary()
## Linear mixed model fit by REML ['lmerMod']
## Formula: 
## corr_normalized ~ group + mean_score + (group | ets_lang) + (group |  
##     wiki_lang)
##    Data: .
## 
## REML criterion at convergence: -5220.7
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -6.9740 -0.4982  0.0029  0.5333  4.2840 
## 
## Random effects:
##  Groups    Name        Variance  Std.Dev.  Corr 
##  ets_lang  (Intercept) 0.000e+00 0.000e+00      
##            groupwithin 1.697e-16 1.302e-08  NaN 
##  wiki_lang (Intercept) 8.203e-04 2.864e-02      
##            groupwithin 5.813e-05 7.624e-03 -1.00
##  Residual              5.855e-05 7.652e-03      
## Number of obs: 784, groups:  ets_lang, 28; wiki_lang, 28
## 
## Fixed effects:
##               Estimate Std. Error t value
## (Intercept)  0.0025690  0.0074610   0.344
## groupwithin -0.0038987  0.0020603  -1.892
## mean_score  -0.0006538  0.0013797  -0.474
## 
## Correlation of Fixed Effects:
##             (Intr) grpwth
## groupwithin -0.512       
## mean_score  -0.687  0.000