This is the analysis of the GOAT vowel that Kathleen McCarthy and I did for our UKLVC presentation.
First, load packages:
library(tidyverse)
library(brms)
library(sjstats)
library(bayesplot)
Then load data:
data <- read_csv("data/tidied_data.csv")
Missing column names filled in: 'X1' [1]Parsed with column specification:
cols(
.default = col_double(),
participant = [31mcol_character()[39m,
vowel = [31mcol_character()[39m,
word = [31mcol_character()[39m,
task = [31mcol_character()[39m,
fol.phonOLOG.seg = [31mcol_character()[39m,
fol.PHONETIC.seg = [31mcol_character()[39m,
age = [31mcol_character()[39m,
sex = [31mcol_character()[39m,
rol.var = [31mcol_character()[39m,
face.l = [31mcol_character()[39m,
price.l = [31mcol_character()[39m,
fol_seg = [31mcol_character()[39m
)
See spec(...) for full column specifications.
str(data)
Classes ‘spec_tbl_df’, ‘tbl_df’, ‘tbl’ and 'data.frame': 11641 obs. of 63 variables:
$ X1 : num 1 2 3 4 5 6 7 8 9 10 ...
$ participant : chr "Ali" "Ali" "Ali" "Ali" ...
$ vowel : chr "face" "face" "face" "face" ...
$ word : chr "AGE" "AGES" "AGES" "AWAY" ...
$ sound_start : num 1771 1853 288 1998 116 ...
$ sound_end : num 1771 1853 288 1998 116 ...
$ task : chr "soc.int" "soc.int" "soc.int" "soc.int" ...
$ F1_20 : num 563 428 410 451 539 ...
$ F1_35 : num 472 449 382 405 510 ...
$ F1_50 : num 454 409 392 402 439 ...
$ F1_65 : num 471 416 384 457 481 ...
$ F1_80 : num 439 403 383 422 456 ...
$ F2_20 : num 2147 2104 2140 1644 1882 ...
$ F2_35 : num 2083 2178 2188 1678 1932 ...
$ F2_50 : num 2043 2163 2216 1654 1997 ...
$ F2_65 : num 2006 2242 2202 1616 1994 ...
$ F2_80 : num 1990 2220 2255 1589 2023 ...
$ fol.phonOLOG.seg: chr "dÊ’" "dÊ’" "dÊ’" "w" ...
$ fol.PHONETIC.seg: chr "dÊ’" "dÊ’" "dÊ’" "w" ...
$ lex.stress : num 1 1 1 1 1 1 1 1 1 1 ...
$ age : chr "adolescent" "adolescent" "adolescent" "adolescent" ...
$ sex : chr "M" "M" "M" "M" ...
$ meanF1 : num 480 421 390 427 485 ...
$ meanF2 : num 2054 2181 2200 1636 1966 ...
$ meanFleeceF1 : num 375 375 375 375 375 ...
$ meanFleeceF2 : num 2128 2128 2128 2128 2128 ...
$ meanTRAPF1 : num 706 706 706 706 706 ...
$ u_F1 : num 375 375 375 375 375 ...
$ u_F2 : num 375 375 375 375 375 ...
$ trapF2 : num 1252 1252 1252 1252 1252 ...
$ S_F1 : num 485 485 485 485 485 ...
$ S_F2 : num 1252 1252 1252 1252 1252 ...
$ normF1_20 : num 1.16 0.881 0.845 0.928 1.111 ...
$ normF1_35 : num 0.971 0.925 0.787 0.834 1.05 ...
$ normF1_50 : num 0.935 0.843 0.808 0.828 0.903 ...
$ normF1_65 : num 0.971 0.856 0.79 0.942 0.991 ...
$ normF1_80 : num 0.904 0.83 0.789 0.869 0.938 ...
$ normF2_20 : num 1.72 1.68 1.71 1.31 1.5 ...
$ normF2_35 : num 1.66 1.74 1.75 1.34 1.54 ...
$ normF2_50 : num 1.63 1.73 1.77 1.32 1.6 ...
$ normF2_65 : num 1.6 1.79 1.76 1.29 1.59 ...
$ normF2_80 : num 1.59 1.77 1.8 1.27 1.62 ...
$ duration : num 0.1935 0.1485 0.1199 0.0772 0.1354 ...
$ changeF1 : num -124.5 -24.8 -27.3 -28.8 -83.9 ...
$ normChangeF1 : num -0.2564 -0.051 -0.0562 -0.0593 -0.1728 ...
$ changeF2 : num -156.3 116.4 115.6 -54.3 140.3 ...
$ normChangeF2 : num -0.1249 0.093 0.0924 -0.0434 0.1121 ...
$ VL : num 199.8 119 118.8 61.4 163.4 ...
$ normVL : num 0.2852 0.1061 0.1081 0.0735 0.206 ...
$ normVSL1 : num 0.1953 0.0736 0.0691 0.098 0.0734 ...
$ normVSL2 : num 0.0483 0.0828 0.0312 0.0206 0.155 ...
$ normVSL3 : num 0.0461 0.0646 0.0213 0.1182 0.0877 ...
$ normVSL4 : num 0.0681 0.0313 0.0428 0.0761 0.0574 ...
$ VSL1 : num 111.5 76.9 55.4 57.3 58.2 ...
$ VSL2 : num 43.7 42.4 30.6 24.8 95.8 ...
$ VSL3 : num 40.7 79.5 16.9 67.2 42.6 ...
$ VSL4 : num 36.1 25.4 53.5 44.2 38.3 ...
$ TrajLength : num 232 224 156 193 235 ...
$ norm_TL : num 0.358 0.252 0.164 0.313 0.373 ...
$ rol.var : chr "n.a." "n.a." "n.a." "n.a." ...
$ face.l : chr "fine" "fine" "fine" "fine" ...
$ price.l : chr "n.a." "n.a." "n.a." "n.a." ...
$ fol_seg : chr "fol_voiced_clus" "fol_voiced_clus" "fol_voiced_clus" "fol_approximant" ...
- attr(*, "spec")=
.. cols(
.. X1 = [32mcol_double()[39m,
.. participant = [31mcol_character()[39m,
.. vowel = [31mcol_character()[39m,
.. word = [31mcol_character()[39m,
.. sound_start = [32mcol_double()[39m,
.. sound_end = [32mcol_double()[39m,
.. task = [31mcol_character()[39m,
.. F1_20 = [32mcol_double()[39m,
.. F1_35 = [32mcol_double()[39m,
.. F1_50 = [32mcol_double()[39m,
.. F1_65 = [32mcol_double()[39m,
.. F1_80 = [32mcol_double()[39m,
.. F2_20 = [32mcol_double()[39m,
.. F2_35 = [32mcol_double()[39m,
.. F2_50 = [32mcol_double()[39m,
.. F2_65 = [32mcol_double()[39m,
.. F2_80 = [32mcol_double()[39m,
.. fol.phonOLOG.seg = [31mcol_character()[39m,
.. fol.PHONETIC.seg = [31mcol_character()[39m,
.. lex.stress = [32mcol_double()[39m,
.. age = [31mcol_character()[39m,
.. sex = [31mcol_character()[39m,
.. meanF1 = [32mcol_double()[39m,
.. meanF2 = [32mcol_double()[39m,
.. meanFleeceF1 = [32mcol_double()[39m,
.. meanFleeceF2 = [32mcol_double()[39m,
.. meanTRAPF1 = [32mcol_double()[39m,
.. u_F1 = [32mcol_double()[39m,
.. u_F2 = [32mcol_double()[39m,
.. trapF2 = [32mcol_double()[39m,
.. S_F1 = [32mcol_double()[39m,
.. S_F2 = [32mcol_double()[39m,
.. normF1_20 = [32mcol_double()[39m,
.. normF1_35 = [32mcol_double()[39m,
.. normF1_50 = [32mcol_double()[39m,
.. normF1_65 = [32mcol_double()[39m,
.. normF1_80 = [32mcol_double()[39m,
.. normF2_20 = [32mcol_double()[39m,
.. normF2_35 = [32mcol_double()[39m,
.. normF2_50 = [32mcol_double()[39m,
.. normF2_65 = [32mcol_double()[39m,
.. normF2_80 = [32mcol_double()[39m,
.. duration = [32mcol_double()[39m,
.. changeF1 = [32mcol_double()[39m,
.. normChangeF1 = [32mcol_double()[39m,
.. changeF2 = [32mcol_double()[39m,
.. normChangeF2 = [32mcol_double()[39m,
.. VL = [32mcol_double()[39m,
.. normVL = [32mcol_double()[39m,
.. normVSL1 = [32mcol_double()[39m,
.. normVSL2 = [32mcol_double()[39m,
.. normVSL3 = [32mcol_double()[39m,
.. normVSL4 = [32mcol_double()[39m,
.. VSL1 = [32mcol_double()[39m,
.. VSL2 = [32mcol_double()[39m,
.. VSL3 = [32mcol_double()[39m,
.. VSL4 = [32mcol_double()[39m,
.. TrajLength = [32mcol_double()[39m,
.. norm_TL = [32mcol_double()[39m,
.. rol.var = [31mcol_character()[39m,
.. face.l = [31mcol_character()[39m,
.. price.l = [31mcol_character()[39m,
.. fol_seg = [31mcol_character()[39m
.. )
data <- data %>% mutate(
X1 = NULL,
participant = factor(participant),
vowel = factor(vowel),
word = factor(word),
task = factor(task),
fol.phonOLOG.seg = factor(fol.phonOLOG.seg),
fol.PHONETIC.seg = factor(fol.PHONETIC.seg),
age = factor(age),
sex = factor(sex),
rol.var = factor(rol.var),
face.l = factor(face.l),
price.l = factor(price.l),
fol_seg = factor(fol_seg)
)
Subset the key vowels:
goat_fleece_lot_foot <- data %>% filter(vowel == "goat" | vowel == "fleece" | vowel=="lot" | vowel=="foot")
Check for really frequent words:
goat_fleece_lot_foot %>% group_by(word) %>%
summarise(N = n()) %>% arrange(desc(N))
OK so there are some pretty frequent words, but nothing like as bad as the PRICE class.
Check distribution of duration:
ggplot(goat_fleece_lot_foot, aes(x = duration, fill = age)) + geom_histogram(bins = 100)
Filter so that duration is not more than 0.75.
goat2 <- goat_fleece_lot_foot %>% filter(duration < 0.75)
Try plotting again
ggplot(goat2, aes(x = duration, fill = age)) + geom_histogram(bins = 100)
Log-transform duration
goat3 <- mutate(goat2,
LogDur = log10(duration))
… and standardize it
goat3 <- mutate(goat3,
Log_dur_z = scale(LogDur))
Recode sex var so that M is the default
goat3$sex <- relevel(goat3$sex, ref = "M")
Set ‘pause’ to be the reference level for following segment:
goat3$fol_seg <- relevel(goat3$fol_seg, ref = "fol_pause")
Select just GOAT:
just_goat <- goat3 %>% filter(vowel=="goat")
Run a model. Not specifying priors - just using brms’s inbuilt ones.
goat_m1 <- brm(normF2_20 ~ Log_dur_z + age*sex + fol_seg + (1|participant) + (1|word), data = just_goat)
Compiling the C++ model
recompiling to avoid crashing R session
Start sampling
SAMPLING FOR MODEL '1e2e842938811e2fe81287f02925f257' NOW (CHAIN 1).
Chain 1:
Chain 1: Gradient evaluation took 0.001022 seconds
Chain 1: 1000 transitions using 10 leapfrog steps per transition would take 10.22 seconds.
Chain 1: Adjust your expectations accordingly!
Chain 1:
Chain 1:
Chain 1: Iteration: 1 / 2000 [ 0%] (Warmup)
Chain 1: Iteration: 200 / 2000 [ 10%] (Warmup)
Chain 1: Iteration: 400 / 2000 [ 20%] (Warmup)
Chain 1: Iteration: 600 / 2000 [ 30%] (Warmup)
Chain 1: Iteration: 800 / 2000 [ 40%] (Warmup)
Chain 1: Iteration: 1000 / 2000 [ 50%] (Warmup)
Chain 1: Iteration: 1001 / 2000 [ 50%] (Sampling)
Chain 1: Iteration: 1200 / 2000 [ 60%] (Sampling)
Chain 1: Iteration: 1400 / 2000 [ 70%] (Sampling)
Chain 1: Iteration: 1600 / 2000 [ 80%] (Sampling)
Chain 1: Iteration: 1800 / 2000 [ 90%] (Sampling)
Chain 1: Iteration: 2000 / 2000 [100%] (Sampling)
Chain 1:
Chain 1: Elapsed Time: 49.6184 seconds (Warm-up)
Chain 1: 22.759 seconds (Sampling)
Chain 1: 72.3773 seconds (Total)
Chain 1:
SAMPLING FOR MODEL '1e2e842938811e2fe81287f02925f257' NOW (CHAIN 2).
Chain 2:
Chain 2: Gradient evaluation took 0.000648 seconds
Chain 2: 1000 transitions using 10 leapfrog steps per transition would take 6.48 seconds.
Chain 2: Adjust your expectations accordingly!
Chain 2:
Chain 2:
Chain 2: Iteration: 1 / 2000 [ 0%] (Warmup)
Chain 2: Iteration: 200 / 2000 [ 10%] (Warmup)
Chain 2: Iteration: 400 / 2000 [ 20%] (Warmup)
Chain 2: Iteration: 600 / 2000 [ 30%] (Warmup)
Chain 2: Iteration: 800 / 2000 [ 40%] (Warmup)
Chain 2: Iteration: 1000 / 2000 [ 50%] (Warmup)
Chain 2: Iteration: 1001 / 2000 [ 50%] (Sampling)
Chain 2: Iteration: 1200 / 2000 [ 60%] (Sampling)
Chain 2: Iteration: 1400 / 2000 [ 70%] (Sampling)
Chain 2: Iteration: 1600 / 2000 [ 80%] (Sampling)
Chain 2: Iteration: 1800 / 2000 [ 90%] (Sampling)
Chain 2: Iteration: 2000 / 2000 [100%] (Sampling)
Chain 2:
Chain 2: Elapsed Time: 52.281 seconds (Warm-up)
Chain 2: 22.4679 seconds (Sampling)
Chain 2: 74.7489 seconds (Total)
Chain 2:
SAMPLING FOR MODEL '1e2e842938811e2fe81287f02925f257' NOW (CHAIN 3).
Chain 3:
Chain 3: Gradient evaluation took 0.00074 seconds
Chain 3: 1000 transitions using 10 leapfrog steps per transition would take 7.4 seconds.
Chain 3: Adjust your expectations accordingly!
Chain 3:
Chain 3:
Chain 3: Iteration: 1 / 2000 [ 0%] (Warmup)
Chain 3: Iteration: 200 / 2000 [ 10%] (Warmup)
Chain 3: Iteration: 400 / 2000 [ 20%] (Warmup)
Chain 3: Iteration: 600 / 2000 [ 30%] (Warmup)
Chain 3: Iteration: 800 / 2000 [ 40%] (Warmup)
Chain 3: Iteration: 1000 / 2000 [ 50%] (Warmup)
Chain 3: Iteration: 1001 / 2000 [ 50%] (Sampling)
Chain 3: Iteration: 1200 / 2000 [ 60%] (Sampling)
Chain 3: Iteration: 1400 / 2000 [ 70%] (Sampling)
Chain 3: Iteration: 1600 / 2000 [ 80%] (Sampling)
Chain 3: Iteration: 1800 / 2000 [ 90%] (Sampling)
Chain 3: Iteration: 2000 / 2000 [100%] (Sampling)
Chain 3:
Chain 3: Elapsed Time: 51.2154 seconds (Warm-up)
Chain 3: 23.105 seconds (Sampling)
Chain 3: 74.3205 seconds (Total)
Chain 3:
SAMPLING FOR MODEL '1e2e842938811e2fe81287f02925f257' NOW (CHAIN 4).
Chain 4:
Chain 4: Gradient evaluation took 0.00064 seconds
Chain 4: 1000 transitions using 10 leapfrog steps per transition would take 6.4 seconds.
Chain 4: Adjust your expectations accordingly!
Chain 4:
Chain 4:
Chain 4: Iteration: 1 / 2000 [ 0%] (Warmup)
Chain 4: Iteration: 200 / 2000 [ 10%] (Warmup)
Chain 4: Iteration: 400 / 2000 [ 20%] (Warmup)
Chain 4: Iteration: 600 / 2000 [ 30%] (Warmup)
Chain 4: Iteration: 800 / 2000 [ 40%] (Warmup)
Chain 4: Iteration: 1000 / 2000 [ 50%] (Warmup)
Chain 4: Iteration: 1001 / 2000 [ 50%] (Sampling)
Chain 4: Iteration: 1200 / 2000 [ 60%] (Sampling)
Chain 4: Iteration: 1400 / 2000 [ 70%] (Sampling)
Chain 4: Iteration: 1600 / 2000 [ 80%] (Sampling)
Chain 4: Iteration: 1800 / 2000 [ 90%] (Sampling)
Chain 4: Iteration: 2000 / 2000 [100%] (Sampling)
Chain 4:
Chain 4: Elapsed Time: 54.5033 seconds (Warm-up)
Chain 4: 31.6893 seconds (Sampling)
Chain 4: 86.1926 seconds (Total)
Chain 4:
#model2 <- brm(normF2_20 ~ Log_dur_z + vowel*age*sex + fol_seg + (1+vowel|participant) + (1|word), data = goat_lot)
Print model summary for the fixed effects (not going into the random effects here!)
tidy_stan(goat_m1, prob=0.89, type="fixed", digits=4)
longer object length is not a multiple of shorter object length
[34m
# Summary Statistics of Stan-Model
[39m estimate std.error HDI(89%) ratio rhat mcse
Intercept 0.8770 0.0320 [ 0.8289 0.9353] 0.0908 1.0044 0.0017
Log_dur_z -0.0310 0.0032 [-0.0366 -0.0262] 1.3140 0.9996 0.0000
Log_dur_z -0.0310 0.0032 [-0.0366 -0.0262] 1.3140 0.9996 0.0000
agechild 0.1425 0.0621 [ 0.0452 0.2405] 0.1110 1.0066 0.0030
sexF 0.1816 0.0593 [ 0.0877 0.2791] 0.1503 1.0017 0.0024
fol_segfol_approximant -0.0052 0.0117 [-0.0243 0.0131] 0.6101 1.0004 0.0002
fol_segfol_nasal 0.0188 0.0133 [-0.0011 0.0396] 0.4427 1.0004 0.0003
fol_segfol_voiced_C 0.0553 0.0126 [ 0.0334 0.0740] 0.5053 1.0029 0.0003
fol_segfol_voiced_clus 0.0562 0.0332 [ 0.0056 0.1123] 0.5395 1.0003 0.0007
fol_segfol_voiceless_C 0.0106 0.0123 [-0.0086 0.0309] 0.4787 1.0024 0.0003
fol_segfol_voiceless_clus -0.0017 0.0215 [-0.0353 0.0310] 0.5779 1.0013 0.0004
fol_segother 0.0109 0.0102 [-0.0051 0.0273] 0.4519 1.0012 0.0002
agechild.sexF -0.2062 0.0910 [-0.3519 -0.0611] 0.1438 1.0066 0.0039
#tidy_stan(model2, prob=0.89, type="fixed", digits=4)
Do equivalence test - this basically does NHST but in a Bayesian framework. It looks at whether the posteriors on the different coefficients fall within the ‘Region of Practical Equivalence’, i.e. is there a significant difference? Not sure how much I like this now, but this is what we did for the UKLVC presentation!
equivalence_test(goat_m1)
[34m# Test for Practical Equivalence
[39m ROPE: [-0.02 0.02]
Parameter H0 inside ROPE 89% HDI
Intercept Rejected 0.00 % [ 0.83 0.94]
Log_dur_z Rejected 0.00 % [-0.04 -0.03]
agechild Rejected 0.00 % [ 0.05 0.24]
sexF Rejected 0.00 % [ 0.09 0.28]
fol_segfol_approximant Undecided 95.96 % [-0.02 0.01]
fol_segfol_nasal Undecided 57.68 % [-0.00 0.04]
fol_segfol_voiced_C Rejected 0.00 % [ 0.03 0.07]
fol_segfol_voiced_clus Undecided 8.85 % [ 0.01 0.11]
fol_segfol_voiceless_C Undecided 82.79 % [-0.01 0.03]
fol_segfol_voiceless_clus Undecided 75.96 % [-0.04 0.03]
fol_segother Undecided 86.94 % [-0.01 0.03]
agechild.sexF Rejected 0.00 % [-0.35 -0.06]
#equivalence_test(model2)
The best way to find out what’s going on is to plot the posteriors of the coefficients:
goat_posterior1 <- as.matrix(goat_m1)
#dimnames(goat_posterior1)
#color_scheme_set("brightblue")
mcmc_intervals(goat_posterior1,
pars = c("b_Log_dur_z",
"b_agechild",
"b_sexF",
"b_fol_segfol_nasal",
"b_fol_segfol_approximant",
"b_fol_segfol_voiced_C",
"b_fol_segfol_voiced_clus",
"b_fol_segfol_voiceless_C",
"b_fol_segfol_voiceless_clus",
"b_fol_segother",
"b_agechild:sexF"),
prob=0.89) + ggplot2::theme_minimal() + ggplot2::geom_vline(xintercept=0)
mcmc_areas(goat_posterior1,
pars = c("b_Log_dur_z",
"b_agechild",
"b_sexF",
"b_fol_segfol_nasal",
"b_fol_segfol_approximant",
"b_fol_segfol_voiced_C",
"b_fol_segfol_voiced_clus",
"b_fol_segfol_voiceless_C",
"b_fol_segfol_voiceless_clus",
"b_fol_segother",
"b_agechild:sexF"),
prob=0.89) + ggplot2::theme_minimal() + ggplot2::geom_vline(xintercept=0)
So what you’ll notice here:
A very narrow posterior on duration; and it is slightly negative. This means that for tokens with a longer duration, we can expect a slightly lower F2, i.e. slightly back onset to GOAT.
The posteriors on age, sex and age:sex are very wide indeed! Age (child vs. adolescent) seems to have a positive effect on F2, as does sex (F as opposed to M). For age and for sex, the 89% credible intervals do not contain 0, although the posterior distributions (i.e. 100% credible intervals) do. Children are predicted to have an F2 that is about 0.14 higher than adolescents, and adolescent girls are predicted to have an F2 in GOAT that is about 0.18 higher than adolescent boys.
The age:sex interaction is negative - although it’s difficult to be confident of that, given that the posterior is so wide! 0.14 - 0.21 = -0.07, i.e. female children are predicted to have a lower F2 in GOAT than the adolescent boys by about 0.07 – that is the median. On the other hand, the interaction effect might be 0.14 - 0.0611 = 0.08, or even 0.2405 - 0.0611 = 0.1794. So female children definitely seem to have lower F2 in GOAT than male children, but how they compare to male adolescents is a bit wobblier.
The posteriors of the various levels of following segment are narrower than age and sex, though not as narrow as duration. Mostly they are pretty close to 0. A following voiced consonant or voiced consonant cluster seems to predict a higher F2.
Let’s take a look at the posteriors for the social factors in more detail:
mcmc_areas(goat_posterior1,
pars = c("b_agechild",
"b_sexF",
"b_agechild:sexF"),
prob=0.89) + ggplot2::theme_minimal() + ggplot2::geom_vline(xintercept=0)
It’s just interesting to see where there are lumps and bumps!
Posterior predictive check
pp_check(goat_m1)
Using 10 posterior samples for ppc type 'dens_overlay' by default.
Define palette:
vowel_cols <- c("#009E73", "#CCFFCC")
transparency <- c(1, 0.2)
pd <- position_dodge(0.9)
c <- ggplot(just_goat, aes(x = sex, y = normF2_20, fill=sex)) + facet_grid(cols = vars(age)) +
geom_boxplot(width = 0.3, position = position_dodge(0.9)) +
theme_minimal() +
scale_fill_manual(values = vowel_cols) +
theme(legend.title = element_blank(),
strip.text = element_text(size = rel(3)),
legend.text = element_blank(),
axis.title = element_text(size = rel(3)),
axis.text = element_text(size = rel(3)),
plot.title = element_text(size = rel(3.5))) +
labs(x = "Age", y = "Normalized F2 at 20%", title = "GOAT: F2 at 20%")
c
Individual slopes over FOOT/GOAT/LOT - just a matter of curiosity, because both GOAT and FOOT are fronting:
goat_lot_foot <- goat3 %>% filter(vowel !="fleece")
gf_means <- goat_lot_foot %>% group_by(vowel, participant, age, sex) %>%
summarise(meanF2 = median(normF2_20))
Plot:
ggplot(gf_means, aes(x = vowel, y = meanF2, colour = vowel, group = participant)) +
geom_point(alpha = 0.5, size = 4) +
geom_line(aes(colour = age)) +
theme_minimal() + facet_grid(cols = vars(sex), rows=vars(age))
I find this interesting because what you see is that:
adolescent females have a strong pattern of having gOAT and FOOT similar in F2, then LOT much more back (and there is very little interspeaker variation in the F2 of LOT)
some of the female children have the same pattern as adolescent females, but others have an upside-down V-shape, where GOAT is front relative to both FOOT and LOT
there is a lot of interspeaker variation among male children
the adolescent boys seem to have a lot of interspeaker variation in FOOT, but all seem to have GOAT and LOT pretty close together
Subset out just GOAT and FLEECE:
table(goat3$vowel)
dress face fleece foot goat goose kit lot mouth price start strut
0 0 379 223 2892 0 0 373 0 0 0 0
trap
0
goat_fleece <- goat3 %>% filter(vowel=="goat" | vowel=="fleece")
Set reference level to GOAT:
goat_fleece$vowel <- relevel(goat_fleece$vowel, ref = "goat")
Check distribution
ggplot(goat_fleece, aes(x = norm_TL, fill=vowel)) + geom_histogram(bins = 100)
Get rid of anything with TL greater than 2
goat_fleece2 <- goat_fleece %>% filter(norm_TL < 2)
Build the model
goat_m2 <- brm(norm_TL ~ Log_dur_z + vowel*age*sex + fol_seg + (1+vowel|participant) + (1|word), data = goat_fleece2)
Compiling the C++ model
recompiling to avoid crashing R session
Start sampling
SAMPLING FOR MODEL 'd93beac45d3bb02918b6ac08195de742' NOW (CHAIN 1).
Chain 1:
Chain 1: Gradient evaluation took 0.001611 seconds
Chain 1: 1000 transitions using 10 leapfrog steps per transition would take 16.11 seconds.
Chain 1: Adjust your expectations accordingly!
Chain 1:
Chain 1:
Chain 1: Iteration: 1 / 2000 [ 0%] (Warmup)
Chain 1: Iteration: 200 / 2000 [ 10%] (Warmup)
Chain 1: Iteration: 400 / 2000 [ 20%] (Warmup)
Chain 1: Iteration: 600 / 2000 [ 30%] (Warmup)
Chain 1: Iteration: 800 / 2000 [ 40%] (Warmup)
Chain 1: Iteration: 1000 / 2000 [ 50%] (Warmup)
Chain 1: Iteration: 1001 / 2000 [ 50%] (Sampling)
Chain 1: Iteration: 1200 / 2000 [ 60%] (Sampling)
Chain 1: Iteration: 1400 / 2000 [ 70%] (Sampling)
Chain 1: Iteration: 1600 / 2000 [ 80%] (Sampling)
Chain 1: Iteration: 1800 / 2000 [ 90%] (Sampling)
Chain 1: Iteration: 2000 / 2000 [100%] (Sampling)
Chain 1:
Chain 1: Elapsed Time: 71.2235 seconds (Warm-up)
Chain 1: 21.1823 seconds (Sampling)
Chain 1: 92.4058 seconds (Total)
Chain 1:
SAMPLING FOR MODEL 'd93beac45d3bb02918b6ac08195de742' NOW (CHAIN 2).
Chain 2:
Chain 2: Gradient evaluation took 0.000841 seconds
Chain 2: 1000 transitions using 10 leapfrog steps per transition would take 8.41 seconds.
Chain 2: Adjust your expectations accordingly!
Chain 2:
Chain 2:
Chain 2: Iteration: 1 / 2000 [ 0%] (Warmup)
Chain 2: Iteration: 200 / 2000 [ 10%] (Warmup)
Chain 2: Iteration: 400 / 2000 [ 20%] (Warmup)
Chain 2: Iteration: 600 / 2000 [ 30%] (Warmup)
Chain 2: Iteration: 800 / 2000 [ 40%] (Warmup)
Chain 2: Iteration: 1000 / 2000 [ 50%] (Warmup)
Chain 2: Iteration: 1001 / 2000 [ 50%] (Sampling)
Chain 2: Iteration: 1200 / 2000 [ 60%] (Sampling)
Chain 2: Iteration: 1400 / 2000 [ 70%] (Sampling)
Chain 2: Iteration: 1600 / 2000 [ 80%] (Sampling)
Chain 2: Iteration: 1800 / 2000 [ 90%] (Sampling)
Chain 2: Iteration: 2000 / 2000 [100%] (Sampling)
Chain 2:
Chain 2: Elapsed Time: 65.1379 seconds (Warm-up)
Chain 2: 27.2946 seconds (Sampling)
Chain 2: 92.4325 seconds (Total)
Chain 2:
SAMPLING FOR MODEL 'd93beac45d3bb02918b6ac08195de742' NOW (CHAIN 3).
Chain 3:
Chain 3: Gradient evaluation took 0.000977 seconds
Chain 3: 1000 transitions using 10 leapfrog steps per transition would take 9.77 seconds.
Chain 3: Adjust your expectations accordingly!
Chain 3:
Chain 3:
Chain 3: Iteration: 1 / 2000 [ 0%] (Warmup)
Chain 3: Iteration: 200 / 2000 [ 10%] (Warmup)
Chain 3: Iteration: 400 / 2000 [ 20%] (Warmup)
Chain 3: Iteration: 600 / 2000 [ 30%] (Warmup)
Chain 3: Iteration: 800 / 2000 [ 40%] (Warmup)
Chain 3: Iteration: 1000 / 2000 [ 50%] (Warmup)
Chain 3: Iteration: 1001 / 2000 [ 50%] (Sampling)
Chain 3: Iteration: 1200 / 2000 [ 60%] (Sampling)
Chain 3: Iteration: 1400 / 2000 [ 70%] (Sampling)
Chain 3: Iteration: 1600 / 2000 [ 80%] (Sampling)
Chain 3: Iteration: 1800 / 2000 [ 90%] (Sampling)
Chain 3: Iteration: 2000 / 2000 [100%] (Sampling)
Chain 3:
Chain 3: Elapsed Time: 64.8869 seconds (Warm-up)
Chain 3: 33.1424 seconds (Sampling)
Chain 3: 98.0294 seconds (Total)
Chain 3:
SAMPLING FOR MODEL 'd93beac45d3bb02918b6ac08195de742' NOW (CHAIN 4).
Chain 4:
Chain 4: Gradient evaluation took 0.000935 seconds
Chain 4: 1000 transitions using 10 leapfrog steps per transition would take 9.35 seconds.
Chain 4: Adjust your expectations accordingly!
Chain 4:
Chain 4:
Chain 4: Iteration: 1 / 2000 [ 0%] (Warmup)
Chain 4: Iteration: 200 / 2000 [ 10%] (Warmup)
Chain 4: Iteration: 400 / 2000 [ 20%] (Warmup)
Chain 4: Iteration: 600 / 2000 [ 30%] (Warmup)
Chain 4: Iteration: 800 / 2000 [ 40%] (Warmup)
Chain 4: Iteration: 1000 / 2000 [ 50%] (Warmup)
Chain 4: Iteration: 1001 / 2000 [ 50%] (Sampling)
Chain 4: Iteration: 1200 / 2000 [ 60%] (Sampling)
Chain 4: Iteration: 1400 / 2000 [ 70%] (Sampling)
Chain 4: Iteration: 1600 / 2000 [ 80%] (Sampling)
Chain 4: Iteration: 1800 / 2000 [ 90%] (Sampling)
Chain 4: Iteration: 2000 / 2000 [100%] (Sampling)
Chain 4:
Chain 4: Elapsed Time: 65.3174 seconds (Warm-up)
Chain 4: 33.569 seconds (Sampling)
Chain 4: 98.8864 seconds (Total)
Chain 4:
Model summary:
tidy_stan(goat_m2, prob=0.89, type="fixed", digits=4)
longer object length is not a multiple of shorter object length
[34m
# Summary Statistics of Stan-Model
[39m estimate std.error HDI(89%) ratio rhat mcse
Intercept 0.3668 0.0255 [ 0.3291 0.4081] 0.2734 1.0047 0.0008
Log_dur_z 0.0467 0.0041 [ 0.0401 0.0533] 1.5432 0.9996 0.0001
Log_dur_z 0.0467 0.0041 [ 0.0401 0.0533] 1.5432 0.9996 0.0001
vowelfleece -0.0297 0.0306 [-0.0797 0.0195] 0.4190 1.0027 0.0008
agechild 0.0445 0.0386 [-0.0205 0.1055] 0.2839 1.0057 0.0012
sexF -0.0213 0.0403 [-0.0854 0.0427] 0.2815 1.0022 0.0012
fol_segfol_nasal -0.0074 0.0160 [-0.0322 0.0185] 0.6699 0.9993 0.0003
fol_segfol_pause -0.0444 0.0159 [-0.0704 -0.0192] 0.7344 0.9997 0.0003
fol_segfol_voiced_C -0.0521 0.0157 [-0.0785 -0.0274] 0.6451 0.9991 0.0003
fol_segfol_voiced_clus -0.0796 0.0365 [-0.1366 -0.0186] 1.1641 0.9997 0.0005
fol_segfol_voiceless_C -0.0439 0.0156 [-0.0680 -0.0179] 0.7151 0.9991 0.0003
fol_segfol_voiceless_clus -0.0297 0.0241 [-0.0679 0.0087] 0.8250 0.9994 0.0004
fol_segother -0.0466 0.0142 [-0.0691 -0.0241] 0.6800 0.9993 0.0003
vowelfleece.agechild 0.0070 0.0518 [-0.0754 0.0870] 0.4122 1.0011 0.0013
vowelfleece.sexF 0.1001 0.0533 [ 0.0143 0.1891] 0.4374 1.0005 0.0013
agechild.sexF 0.0158 0.0618 [-0.0847 0.1141] 0.2485 1.0059 0.0020
vowelfleece.agechild.sexF -0.1089 0.0806 [-0.2385 0.0259] 0.3972 1.0014 0.0021
Plot the posteriors of the coefficients:
goat_posterior2 <- as.matrix(goat_m2)
#dimnames(goat_posterior2)
#color_scheme_set("brightblue")
mcmc_intervals(goat_posterior2,
pars = c("b_Log_dur_z",
"b_vowelfleece",
"b_agechild",
"b_sexF",
"b_fol_segfol_nasal",
"b_fol_segfol_approximant",
"b_fol_segfol_voiced_C",
"b_fol_segfol_voiced_clus",
"b_fol_segfol_voiceless_C",
"b_fol_segfol_voiceless_clus",
"b_fol_segother",
"b_agechild:sexF"),
prob=0.89) + ggplot2::theme_minimal() + ggplot2::geom_vline(xintercept=0)
mcmc_areas(goat_posterior2,
pars = c("b_Log_dur_z",
"b_vowelfleece",
"b_agechild",
"b_sexF",
"b_fol_segfol_nasal",
"b_fol_segfol_approximant",
"b_fol_segfol_voiced_C",
"b_fol_segfol_voiced_clus",
"b_fol_segfol_voiceless_C",
"b_fol_segfol_voiceless_clus",
"b_fol_segother",
"b_agechild:sexF"),
prob=0.89) + ggplot2::theme_minimal() + ggplot2::geom_vline(xintercept=0)
So the results show:
A longer duration predicts a greater TL. A 1 standard deviation increase in duration predicts a 0.04–0.05 increase in TL.
FLEECE tends to have a shorter Trajectory Length than GOAT, but we can’t be super confident of whether there is an effect: the 89% credible interval contains 0 and its upper bound is 0.02, i.e. FLEECE might actually have a longer TL than GOAT.
A following nasal or approximant predicts a greater Trajectory Length. Otherwise, the following segments don’t seem to have much of an effect: their posterior medians are close to 0 and their 89% credible intervals contain 0.
The posteriors on the language-external effects (sex, age and their interaction) are very wide, and the credible intervals contain 0. Being a child could predict a shorter TL by 0.02, or a longer TL by 0.11! It looks like being a child predicts a longer Trajectory Length, being female predicts a shorter Trajectory Length, and as to the interaction, it’s not clear what’s going on there!
Plot gOAT Traj
vowel_cols <- c("#009E73", "#E69F00")
Plot it:
a <- ggplot(goat_fleece2, aes(x = sex, y = norm_TL, fill = vowel)) + facet_grid(cols = vars(age)) +
geom_boxplot(width = 0.3, position = position_dodge(0.9)) +
theme_minimal() +
scale_fill_manual(values = vowel_cols) +
theme(legend.title = element_text(size = rel(3)),
strip.text = element_text(size = rel(3)),
legend.text = element_text(size = rel(3)),
axis.title = element_text(size = rel(3)),
axis.text = element_text(size = rel(3)),
plot.title = element_text(size = rel(3.5))) +
labs(x = "Age", y = "Trajectory Length", title = "GOAT: Trajectory Length") +
ylim(0, 2)
a