1 Packages
library(tidyverse) # data wrangling and viz
library(lme4) # freq mixed-effects models
library(ordinal) # freq ordinal mixed models (clmm)
library(emmeans) # post-hoc test
library(brms) # Bayesian mixed models
library(broom) # tidy output for LRT
2 Data
Preprocessing:
<- read_csv("data/woo.csv") %>%
data filter(!problematic) %>%
select(-problematic, -arpa_sound) %>%
mutate(jc = judgement * confidence, # combine judgement and confidence
condition = str_c(system, trained, sep = "_"),
across(everything(), factor),
across(jc, ordered),
across(trained, as.logical))
Preview:
glimpse(data)
Rows: 8,545
Columns: 10
$ item <fct> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
$ student <fct> 1103211, 1026211, 1112211, 1105211, 1111211, 1111211, 10222…
$ rater <fct> 5, 9, 1, 1, 6, 10, 1, 1, 6, 6, 9, 6, 6, 1, 5, 1, 5, 1, 1, 6…
$ judgement <fct> 1, -1, 1, 1, 1, 1, 1, 1, 1, -1, -1, 1, 1, 1, 1, -1, -1, 1, …
$ confidence <fct> 3, 4, 3, 3, 4, 3, 2, 4, 4, 4, 1, 4, 4, 4, 1, 3, 2, 4, 4, 4,…
$ sound <fct> 0, 9, 0, 0, 9, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 9,…
$ system <fct> 3, 1, 3, 3, 2, 2, 2, 2, 1, 1, 2, 2, 2, 2, 2, 1, 3, 3, 2, 1,…
$ trained <lgl> TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE,…
$ jc <ord> 3, -4, 3, 3, 4, 3, 2, 4, 4, -4, -1, 4, 4, 4, 1, -3, -2, 4, …
$ condition <fct> 3_TRUE, 1_TRUE, 3_TRUE, 3_TRUE, 2_TRUE, 2_TRUE, 2_TRUE, 2_T…
3 Frequentist models
3.1 Binomial model
Analysis of the binary judgement data (whether pronunciation improved or not). Model fit improved significantly when adding each fixed effect; the interaction of system and trained was significant.
3.1.1 Interaction model
m0 <- glmer(judgement ~ 1 + (1|item) + (1|student) + (1|rater), data = data, family = "binomial")
m1 <- glmer(judgement ~ system + (1|item) + (1|student) + (1|rater), data = data, family = "binomial")
m2 <- glmer(judgement ~ system + trained + (1|item) + (1|student) + (1|rater), data = data, family = "binomial")
m3 <- glmer(judgement ~ system * trained + (1|item) + (1|student) + (1|rater), data = data, family = "binomial")
A likelihood-ratio test comparing model fit shows evidence for an interaction of system and trained.
lrt <- anova(m0, m1, m2, m3, test = "lrt")
tidy(lrt)
# A tibble: 4 × 9
term npar AIC BIC logLik deviance statistic df p.value
<chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 m0 4 11810. 11839. -5901. 11802. NA NA NA
2 m1 6 11805. 11847. -5897. 11793. 9.28 2 0.00964
3 m2 7 11801. 11851. -5894. 11787. 5.96 1 0.0146
4 m3 9 11798. 11861. -5890. 11780. 7.19 2 0.0275
Pairwise comparisons show that 'improved' judgements were significantly more likely for systems 1 and 2 than for system 3, with no difference between systems 1 and 2, but only in the trained group; no system differences were found for the untrained group.
marginal <- emmeans(m3, ~ system | trained)
pairs(marginal, adjust = "tukey")
trained = FALSE:
contrast estimate SE df z.ratio p.value
1 - 2 0.1380 0.0884 Inf 1.561 0.2627
1 - 3 0.0402 0.0881 Inf 0.456 0.8918
2 - 3 -0.0978 0.0883 Inf -1.108 0.5090
trained = TRUE:
contrast estimate SE df z.ratio p.value
1 - 2 0.0392 0.0671 Inf 0.584 0.8286
1 - 3 0.2333 0.0672 Inf 3.472 0.0015
2 - 3 0.1941 0.0670 Inf 2.896 0.0105
Results are given on the log odds ratio (not the response) scale.
P value adjustment: tukey method for comparing a family of 3 estimates
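For reporting, the same contrasts can be back-transformed to odds ratios via emmeans' type = "response" argument (a quick sketch, not part of the original analysis):
# Sketch: the same pairwise comparisons expressed as odds ratios
# rather than log odds ratios (output not shown)
pairs(marginal, adjust = "tukey", type = "response")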
3.1.2 Trained group
Data were also analysed separately for the trained group.
# subset the trained group
data_trained <- data %>% filter(trained)
m0 <- glmer(judgement ~ 1 + (1|item) + (1|student) + (1|rater), data = data_trained, family = "binomial")
m1 <- glmer(judgement ~ system + (1|item) + (1|student) + (1|rater), data = data_trained, family = "binomial")
The likelihood-ratio test shows a better fit for the model with system as a fixed effect.
lrt <- anova(m0, m1, test = "lrt")
tidy(lrt)
# A tibble: 2 × 9
term npar AIC BIC logLik deviance statistic df p.value
<chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 m0 4 7491. 7518. -3742. 7483. NA NA NA
2 m1 6 7481. 7521. -3735. 7469. 13.9 2 0.000965
Pairwise comparisons mirror the results above: 'improved' judgements were more likely for systems 1 and 2 than for system 3, with no difference between systems 1 and 2.
em <- emmeans(m1, ~ system)
pairs(em, adjust = "tukey")
contrast estimate SE df z.ratio p.value
1 - 2 0.0387 0.0671 Inf 0.577 0.8325
1 - 3 0.2333 0.0672 Inf 3.473 0.0015
2 - 3 0.1946 0.0670 Inf 2.904 0.0103
Results are given on the log odds ratio (not the response) scale.
P value adjustment: tukey method for comparing a family of 3 estimates
3.2 Ordinal model
Above, the improvement judgement was analysed as binary. Combined with the confidence ratings, we now analyse it as ordinal data ranging from -4 (pronunciation definitely not improved) to 4 (pronunciation definitely improved).
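As a quick check of this coding (a hypothetical snippet, not part of the original scripts), jc can be cross-tabulated against its components:
# Each jc value should equal judgement * confidence,
# e.g. judgement -1 with confidence 4 gives jc = -4
data %>% count(judgement, confidence, jc)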
3.2.1 Interaction model
We fit ordinal models, incrementally adding predictors.
m0_jc <- clmm(jc ~ 1 + (1|item) + (1|student) + (1|rater), data = data)
m1_jc <- clmm(jc ~ system + (1|item) + (1|student) + (1|rater), data = data)
m2_jc <- clmm(jc ~ system + trained + (1|item) + (1|student) + (1|rater), data = data)
m3_jc <- clmm(jc ~ system * trained + (1|item) + (1|student) + (1|rater), data = data)
Likelihood-ratio tests show improved model fit for each added predictor, providing evidence for an interaction between system and trained.
lrt <- anova(m0_jc, m1_jc, m2_jc, m3_jc)
as.data.frame(lrt)
no.par AIC logLik LR.stat df Pr(>Chisq)
m0_jc 11 37119.95 -18548.97 NA NA NA
m1_jc 13 37112.79 -18543.40 11.159023 2 0.003774409
m2_jc 14 37106.94 -18539.47 7.846105 1 0.005093031
m3_jc 16 37097.94 -18532.97 13.007541 2 0.001497781
Pairwise comparisons reflect the results above: improvement was found for systems 1 and 2 compared to system 3, but only in the trained group.
em <- emmeans(m3_jc, ~ system | trained)
pairs(em, adjust = "tukey")
trained = FALSE:
contrast estimate SE df z.ratio p.value
1 - 2 0.0270 0.0760 Inf 0.355 0.9330
1 - 3 -0.0426 0.0759 Inf -0.561 0.8409
2 - 3 -0.0695 0.0764 Inf -0.910 0.6337
trained = TRUE:
contrast estimate SE df z.ratio p.value
1 - 2 0.0207 0.0581 Inf 0.356 0.9326
1 - 3 0.2543 0.0585 Inf 4.345 <.0001
2 - 3 0.2336 0.0583 Inf 4.005 0.0002
P value adjustment: tukey method for comparing a family of 3 estimates
3.2.2 Trained group
Data were also analysed separately for the trained group.
data_trained <- data %>% filter(trained)
m0_jc <- clmm(jc ~ 1 + (1|item) + (1|student) + (1|rater), data = data_trained)
m1_jc <- clmm(jc ~ system + (1|item) + (1|student) + (1|rater), data = data_trained)
The likelihood-ratio test showed improved fit when adding system as a predictor.
lrt <- anova(m0_jc, m1_jc)
as.data.frame(lrt)
no.par AIC logLik LR.stat df Pr(>Chisq)
m0_jc 11 23580.98 -11779.49 NA NA NA
m1_jc 13 23562.10 -11768.05 22.87744 2 1.077029e-05
Pairwise comparisons show significant improvement for systems 1 and 2 compared to system 3, with no difference between systems 1 and 2.
em <- emmeans(m1_jc, ~ system)
pairs(em, adjust = "tukey")
contrast estimate SE df z.ratio p.value
1 - 2 0.0185 0.0579 Inf 0.320 0.9452
1 - 3 0.2504 0.0583 Inf 4.292 0.0001
2 - 3 0.2319 0.0581 Inf 3.988 0.0002
P value adjustment: tukey method for comparing a family of 3 estimates
4 Bayesian models
Note: I’m reporting leave-one-out cross-validation (LOO-CV) for model comparisons. However, I don’t think it is very informative here and it can probably be ignored, because LOO-CV tends to be conservative for nested models; a common rule of thumb treats an elpd difference within about two standard errors as negligible.
4.1 Binomial model
4.1.1 Interaction model
Model specifications are the same as for the frequentist models above. The models were run in separate scripts (see the scripts folder) and are only shown here in brief.
# Model specification
m0 <- bf(judgement ~ 1 + (1|item) + (1|student) + (1|rater), family = bernoulli())
m1 <- bf(judgement ~ system + (1|item) + (1|student) + (1|rater), family = bernoulli())
m2 <- bf(judgement ~ system + trained + (1|item) + (1|student) + (1|rater), family = bernoulli())
m3 <- bf(judgement ~ system * trained + (1|item) + (1|student) + (1|rater), family = bernoulli())
The predictor trained was sum-coded so that the coefficient shows the main effect (difference between trained and untrained) independently of system. System was Helmert-coded, with SYS1 being the difference between system 3 and systems 1 and 2 combined, and SYS2 being the difference between systems 1 and 2; again, the main effects of system are independent of the predictor trained. The respective interactions were also included, indicated by the colon ‘:’.
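The coefficient names below (conditionSYS1 etc.) suggest these contrasts were set on the condition factor in the model-fitting scripts. A minimal sketch of one way to construct such a coding, assuming the level order 1_FALSE, 2_FALSE, 3_FALSE, 1_TRUE, 2_TRUE, 3_TRUE:
# Hypothetical contrast matrix for the 6-level condition factor;
# the actual coding lives in the model-fitting scripts
sys1  <- rep(c( 1/3,  1/3, -2/3), 2)   # systems 1 + 2 vs. system 3
sys2  <- rep(c( 1/2, -1/2,  0  ), 2)   # system 1 vs. system 2
train <- rep(c(-1/2,  1/2), each = 3)  # trained vs. untrained (sum-coded)
contrasts(data$condition) <- cbind(SYS1 = sys1, SYS2 = sys2, TRAIN = train,
                                   `SYS1:TRAIN` = sys1 * train,
                                   `SYS2:TRAIN` = sys2 * train)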
Evidence in favour of the alternative hypothesis was found for the main effect of trained and for the interaction of SYS1 and TRAIN; evidence for the other predictors was negligible. (Evid.Ratio from brms’ hypothesis() is the Savage-Dickey evidence ratio for the point null, so H1 = 1/H0 quantifies evidence for the alternative.)
# Hypotheses
<- c("conditionSYS1 = 0",
hs "conditionSYS2 = 0",
"conditionTRAIN = 0",
"conditionSYS1:TRAIN = 0",
"conditionSYS2:TRAIN = 0")
# Calculate and summarise evidence
map_dfr(hs, ~hypothesis(m3, .x)[["hypothesis"]]) %>%
select(Predictor = Hypothesis,
Estimate, Lower = CI.Lower,
Upper = CI.Upper,
H0 = Evid.Ratio) %>%
mutate(H1 = 1/H0,
across(where(is.numeric), round, 2),
across(Predictor, str_remove_all, "\\(condition|\\)| = 0"))
Predictor Estimate Lower Upper H0 H1
1 SYS1 0.36 0.00 0.74 0.79 1.26
2 SYS2 0.17 -0.04 0.39 2.68 0.37
3 TRAIN -0.41 -0.75 -0.09 0.26 3.88
4 SYS1:TRAIN -0.47 -0.84 -0.10 0.21 4.73
5 SYS2:TRAIN 0.10 -0.12 0.32 5.60 0.18
Model comparisons using leave-one-out cross-validation (LOO-CV): the model with the interaction of system and trained showed the highest predictive performance, but the improvement over the main-effects model was negligible.
model_comparison <- loo(m0, m1, m2, m3)
elpd_diff se_diff elpd_loo se_elpd_loo
m3 0.00 0.00 -5891.10 8.92
m2 -2.21 2.64 -5893.31 8.50
m1 -3.18 3.22 -5894.27 8.29
m0 -5.67 4.40 -5896.76 7.72
The posterior was summarised as cell means with 95% probability intervals, shown in Figure 4.1, illustrating the improvement for systems 1 and 2 compared to system 3 in the trained group.
Figure 4.1: Estimated cell means for judgement by system with 95% PIs (probability intervals).
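A sketch of how such cell means can be obtained from the brms fit (the actual figure code is in the scripts folder; type = "response" back-transforms to the probability scale):
# Sketch: posterior cell means with 95% intervals on the probability
# scale, roughly corresponding to Figure 4.1 (output not shown)
cell_means <- emmeans(m3, ~ system | trained, type = "response")
summary(cell_means)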
Nested comparisons for system within trained were calculated:
em <- emmeans(m3, ~ system | trained)
contrast(em, "tukey")
trained = FALSE:
contrast estimate lower.HPD upper.HPD
1 - 2 0.1374 -0.0348 0.2982
1 - 3 0.0404 -0.1392 0.2042
2 - 3 -0.0970 -0.2654 0.0739
trained = TRUE:
contrast estimate lower.HPD upper.HPD
1 - 2 0.0406 -0.0936 0.1669
1 - 3 0.2326 0.1053 0.3634
2 - 3 0.1930 0.0619 0.3264
Point estimate displayed: median
Results are given on the log odds ratio (not the response) scale.
HPD interval probability: 0.95
4.1.2 Trained group
Data were analysed separately for the trained group.
Filter trained group data:
data_trained <- filter(data, trained)
Model specifications are summarised in brief. The models were run using a separate script that can be found in the “scripts” folder.
# Model specification
m0 <- bf(judgement ~ 1 + (1|item) + (1|student) + (1|rater), family = bernoulli())
m1 <- bf(judgement ~ system + (1|item) + (1|student) + (1|rater), family = bernoulli())
Model comparisons using leave-one-out cross-validation (LOO-CV): there was a small improvement in predictive performance for the model with system as a predictor.
model_comparison <- loo(m0, m1)
elpd_diff se_diff elpd_loo se_elpd_loo
m1 0.00 0.00 -3736.85 7.76
m0 -4.84 3.73 -3741.68 6.85
Evidence for a difference from the baseline (system 1) was found for system 3, which received lower judgements, but not for system 2.
Note: it would probably make more sense to refit this model with system 3 as the baseline or with Helmert contrasts as above.
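A sketch of such a refit (hypothetical; it was not run for this report):
# Hypothetical refit with system 3 as the reference level
data_releveled <- mutate(data_trained, system = relevel(system, ref = "3"))
m1_releveled <- brm(judgement ~ system + (1|item) + (1|student) + (1|rater),
                    family = bernoulli(), data = data_releveled)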
# Hypotheses
<- c("system2 = 0", "system3 = 0")
hs
# Calculate and summarise evidence
map_dfr(hs, ~hypothesis(m1, .x)[["hypothesis"]]) %>%
select(Predictor = Hypothesis,
Estimate, Lower = CI.Lower,
Upper = CI.Upper,
H0 = Evid.Ratio) %>%
mutate(H1 = 1/H0,
across(where(is.numeric), round, 2),
across(Predictor, str_remove_all, "\\(|\\)| = 0"))
Predictor Estimate Lower Upper H0 H1
1 system2 -0.04 -0.18 0.09 24.91 0.04
2 system3 -0.23 -0.36 -0.10 0.07 15.14
The posterior was summarised as cell means with 95% probability intervals, shown in Figure 4.2, illustrating the improvement for systems 1 and 2 compared to system 3.
Figure 4.2: Estimated cell means for judgement by system with 95% PIs (probability intervals). Trained group only.
4.2 Ordinal model
4.2.1 Interaction model
Model specifications are summarised in brief. The models were run using a separate script that can be found in the “scripts” folder.
# Model specification
m0 <- bf(jc ~ 1 + (1|item) + (1|student) + (1|rater), family = cumulative())
m1 <- bf(jc ~ system + (1|item) + (1|student) + (1|rater), family = cumulative())
m2 <- bf(jc ~ system + trained + (1|item) + (1|student) + (1|rater), family = cumulative())
m3 <- bf(jc ~ system * trained + (1|item) + (1|student) + (1|rater), family = cumulative())
Predictors were coded as in the binomial interaction model above: trained was sum-coded so the coefficient gives the main effect of training, and system was Helmert-coded (SYS1: systems 1 and 2 combined vs. system 3; SYS2: system 1 vs. 2), with interactions indicated by the colon ‘:’.
Evidence in favour of the alternative hypothesis was found for the interaction of SYS1 and TRAIN; evidence for the other predictors was negligible.
# Hypotheses
hs <- c("conditionSYS1 = 0",
        "conditionSYS2 = 0",
        "conditionTRAIN = 0",
        "conditionSYS1:TRAIN = 0",
        "conditionSYS2:TRAIN = 0")
# Calculate and summarise evidence
map_dfr(hs, ~hypothesis(m3, .x)[["hypothesis"]]) %>%
select(Predictor = Hypothesis,
Estimate, Lower = CI.Lower,
Upper = CI.Upper,
H0 = Evid.Ratio) %>%
mutate(H1 = 1/H0, across(where(is.numeric), round, 2),
across(Predictor, str_remove_all, "\\(condition|\\)| = 0"))
Predictor Estimate Lower Upper H0 H1
1 SYS1 0.38 0.06 0.71 0.93 1.07
2 SYS2 0.05 -0.14 0.23 18.90 0.05
3 TRAIN -0.42 -0.72 -0.12 0.48 2.08
4 SYS1:TRAIN -0.60 -0.92 -0.27 0.04 24.31
5 SYS2:TRAIN 0.00 -0.19 0.20 20.83 0.05
Model comparisons using leave-one-out cross-validation (LOO-CV): the interaction model m3 shows the highest predictive performance, but is only negligibly better than the main-effects model.
model_comparison <- loo(m0, m1, m2, m3)
elpd_diff se_diff elpd_loo se_elpd_loo
m3 0.00 0.00 -18540.64 21.68
m2 -4.83 3.59 -18545.47 21.35
m1 -6.23 4.14 -18546.87 21.21
m0 -9.92 5.40 -18550.56 20.93
The posterior was summarised as cell means with 95% probability intervals, shown in Figure 4.3, illustrating the improvement for systems 1 and 2 compared to system 3 in the trained group; no differences can be seen in the untrained group. In particular, the posterior probability of choosing response categories larger than 0 (i.e. pronunciation judged as improved) was higher for systems 1 and 2 than for system 3, while for system 3 the posterior probability of choosing response categories smaller than 0 (i.e. pronunciation judged as not improved) was larger than for systems 1 and 2.
Figure 4.3: Estimated cell means for confidence improved by response category with 95% PIs (probability intervals). Values show the probability of a rater choosing a response category.
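The per-category probabilities in Figure 4.3 can be reproduced in outline with brms’ conditional_effects (a sketch; the published figure was produced by the plotting scripts):
# Sketch: posterior probability of each jc response category by system,
# conditioned on training group (categorical = TRUE gives per-category curves)
conditional_effects(m3, effects = "system",
                    conditions = data.frame(trained = c(FALSE, TRUE)),
                    categorical = TRUE)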
Figure 4.4 summarises the same finding across response categories.
Figure 4.4: Estimated cell means for confidence improved by system with 95% PIs (probability intervals).
Estimated marginal means, as shown in Figure 4.4:
# A tibble: 6 × 5
System Trained Estimate Lower Upper
<chr> <chr> <dbl> <dbl> <dbl>
1 System 1 untrained 0.02 -0.16 0.21
2 System 1 trained 0.38 0.23 0.53
3 System 2 untrained -0.02 -0.21 0.18
4 System 2 trained 0.35 0.2 0.49
5 System 3 untrained 0.09 -0.1 0.27
6 System 3 trained 0 -0.15 0.15
Post-hoc comparisons were calculated. The probability intervals suggest improvement for systems 1 and 2 compared to system 3, but only in the trained group.
em <- emmeans(m3, ~ system | trained)
contrast(em, "tukey")
trained = FALSE:
contrast estimate lower.HPD upper.HPD
1 - 2 0.0237 -0.1330 0.1653
1 - 3 -0.0440 -0.1866 0.1093
2 - 3 -0.0676 -0.2171 0.0797
trained = TRUE:
contrast estimate lower.HPD upper.HPD
1 - 2 0.0220 -0.0955 0.1325
1 - 3 0.2560 0.1389 0.3705
2 - 3 0.2342 0.1253 0.3547
Point estimate displayed: median
Results are given on the log odds ratio (not the response) scale.
HPD interval probability: 0.95
4.2.2 Trained group
Filter trained group.
data_trained <- filter(data, trained)
Model specifications in brief.
# Model specification
m0 <- bf(jc ~ 1 + (1|item) + (1|student) + (1|rater), family = cumulative())
m1 <- bf(jc ~ system + (1|item) + (1|student) + (1|rater), family = cumulative())
Evidence was summarised as above: a difference from the baseline (system 1) was found for system 3 but not for system 2.
# Hypotheses
<- c("system2 = 0", "system3 = 0")
hs
# Calculate and summarise evidence
map_dfr(hs, ~hypothesis(m1, .x)[["hypothesis"]]) %>%
select(Predictor = Hypothesis, Estimate,
Lower = CI.Lower, Upper = CI.Upper,
H0 = Evid.Ratio) %>%
mutate(H1 = 1/H0,
across(where(is.numeric), round, 2),
across(Predictor, str_remove_all, "\\(|\\)| = 0"))
Predictor Estimate Lower Upper H0 H1
1 system2 -0.02 -0.13 0.09 31.54 0.03
2 system3 -0.25 -0.36 -0.14 0.03 28.65
Model comparisons using leave-one-out cross-validation (LOO-CV).
model_comparison <- loo(m0, m1)
elpd_diff se_diff elpd_loo se_elpd_loo
m1 0.00 0.00 -11775.81 17.46
m0 -9.71 4.83 -11785.53 16.76
Posterior summary as above.
Figure 4.5: Estimated cell means for confidence improved by response category and system with 95% PIs (probability intervals). Trained group only.
Figure 4.6: Estimated cell means for confidence improved by system with 95% PIs (probability intervals). Trained group only.
Estimated marginal means:
# A tibble: 3 × 4
System Estimate Lower Upper
<fct> <dbl> <dbl> <dbl>
1 1 0.37 0.22 0.53
2 2 0.34 0.18 0.5
3 3 0 -0.16 0.15