library(tidyverse)
library(mgcv)
library(ggplot2)
A function for loading a single tracks file.
read_filter <- function(filename){
df <- read_delim(filename, delim = "\t")
out <- df %>% filter(plt_vclass == "aw", fol_seg != "L") %>%
select(age, year, id, F1, F2, t, context, ethnicity, word, fol_seg)
}
This takes forever, cause the eventual data read in is 265.2 Mb, over a network attached drive.
all_aw_list <- map(all_tracks, read_filter)
Cleaning and processing.
add_filename <- function(df, filename){
df$idstring = basename(filename)
return(df)
}
all_aw_list <- map2(all_aw_list, all_tracks, add_filename)
all_aw_dat <- bind_rows(all_aw_list)
Focusing on data from White Philadelphians
dat_to_use <- all_aw_dat %>%
filter(!grepl("a|h", ethnicity),
!grepl(verboten_string(), idstring)) %>%
group_by(idstring, id)%>%
mutate(rel_t = t-min(t),
prop_t = rel_t/max(rel_t))%>%
slice(floor(seq(1, n(), length = 20)))
pnc_demo <- read.delim("/Volumes/jfruehwa/PNC/PNC_corpus_info.txt")
pnc_demo %<>%
mutate(idstring = gsub("(PH[0-9]+-[0-9]+-[0-9]+-).*", "\\1", speaker))
dat_to_use %<>%
ungroup()%>%
mutate(idstring = gsub("(PH[0-9]+-[0-9]+-[0-9]+-).*", "\\1", idstring))
Recoding the following segment to voicing context
dat_to_use %<>%
left_join(pnc_demo %>% select(idstring, sex))%>%
filter(!grepl("[AEIOU]", fol_seg))%>%
mutate(dob = year-age,
era = cut(dob, 3, dig.lab=4),
voicing = recode(fol_seg, M = "nasal", N = "nasal", NG = "nasal",
P = "voiceless", `T` = "voiceless", K = "voiceless",
`F` = "voiceless", TH = "voiceless", S = "voiceless",
SH = "voiceless", CH = "voiceless",
.default = "voiced") %>% factor() %>% relevel("voiced"))
Joining, by = c("idstring", "sex")
dat_to_use %>%
group_by(idstring, id, sex) %>%
mutate(meas = 1:n()) %>%
group_by(sex, meas,era, voicing, idstring)%>%
summarise(F1 = mean(F1, na.rm = T),
F2 = mean(F2, na.rm = T))%>%
summarise(F1 = mean(F1, na.rm = T),
F2 = mean(F2, na.rm = T))%>%
ggplot(aes(meas, F1))+
geom_line(aes(color = voicing))+
facet_grid(era~sex)+
theme_bw()
Just to keep things simple & avoid normalization for now, just female data.
f_dat <- dat_to_use %>% filter(sex == "f")
Fit the basic model with a smooth for voiced, voiceless and nasal. Voiceless and Nasal are entered as difference curves. Random effects for speaker and word.
Ideally this would also have random smooths by speaker.
f_dat %<>%
mutate(voiceless = (voicing == "voiceless")*1,
nasal = (voicing == "nasal")*1,
idstring = factor(idstring),
word = factor(word))
f_mod <- bam(F1 ~ s(prop_t) +
s(prop_t, by = voiceless) +
s(prop_t, by = nasal)+
s(idstring, bs = 're')+
s(word, bs = 're'),
data = f_dat)
library(itsadug)
Loading required package: plotfunctions
Attaching package: ‘plotfunctions’
The following object is masked from ‘package:ggplot2’:
alpha
Loaded package itsadug 2.2 (see 'help("itsadug")' ).
summary(f_mod)
Family: gaussian
Link function: identity
Formula:
F1 ~ s(prop_t) + s(prop_t, by = voiceless) + s(prop_t, by = nasal) +
s(idstring, bs = "re") + s(word, bs = "re")
Parametric coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 767.722 6.425 119.5 <2e-16 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Approximate significance of smooth terms:
edf Ref.df F p-value
s(prop_t) 8.052 8.613 446.3 <2e-16 ***
s(prop_t):voiceless 5.502 6.439 277.0 <2e-16 ***
s(prop_t):nasal 8.587 9.412 293.8 <2e-16 ***
s(idstring) 177.969 179.000 4638.2 <2e-16 ***
s(word) 253.881 270.000 602.2 <2e-16 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
R-sq.(adj) = 0.318 Deviance explained = 32%
fREML = 1.3102e+06 Scale est. = 12178 n = 213840
This plots the difference of the voiced curve from the intercept (767 hz).
plot(f_mod, select = 1, main = "voiced curve", ylim = c(-100,100))
abline(h = 0, col = 'red')
Difference between voiced and voiceless
plot(f_mod, select = 2, main = "voiceless effect", ylim = c(-100,100))
abline(h = 0, col = 'red')
difference between voiced and nasal
plot(f_mod, select = 3, main = "nasal effect", ylim = c(-100,100))
abline(h = 0, col = 'red')
Summed effects
plot_smooth(f_mod, view = "prop_t", cond= list(nasal = 0, voiceless = 0),
rm.ranef = T,
ylim = c(660, 850))
Summary:
* prop_t : numeric predictor; with 30 values ranging from 0.000000 to 1.000000.
* voiceless : numeric predictor; set to the value(s): 0.
* nasal : numeric predictor; set to the value(s): 0.
* idstring : factor; set to the value(s): PH73-5-1-. (Might be canceled as random effect, check below.)
* word : factor; set to the value(s): OUT. (Might be canceled as random effect, check below.)
* NOTE : The following random effects columns are canceled: s(idstring),s(word)
plot_smooth(f_mod, view = "prop_t", cond= list(nasal = 0 , voiceless = 1), add = T,
rm.ranef = T,
col = 'red')
Summary:
* prop_t : numeric predictor; with 30 values ranging from 0.000000 to 1.000000.
* voiceless : numeric predictor; set to the value(s): 1.
* nasal : numeric predictor; set to the value(s): 0.
* idstring : factor; set to the value(s): PH73-5-1-. (Might be canceled as random effect, check below.)
* word : factor; set to the value(s): OUT. (Might be canceled as random effect, check below.)
* NOTE : The following random effects columns are canceled: s(idstring),s(word)
plot_smooth(f_mod, view = "prop_t", cond= list(nasal = 1 , voiceless = 0), add = T,
rm.ranef = T,
col = 'blue')
Summary:
* prop_t : numeric predictor; with 30 values ranging from 0.000000 to 1.000000.
* voiceless : numeric predictor; set to the value(s): 0.
* nasal : numeric predictor; set to the value(s): 1.
* idstring : factor; set to the value(s): PH73-5-1-. (Might be canceled as random effect, check below.)
* word : factor; set to the value(s): OUT. (Might be canceled as random effect, check below.)
* NOTE : The following random effects columns are canceled: s(idstring),s(word)
There is a high degree of autocorrelation in the residuals.
acf_resid(f_mod, split_pred=list(f_dat$idstring, f_dat$word), main="ACF resid")
Get the acf at lag = 1
(valRho <- acf_resid(f_mod, split_pred = list(f_dat$idstring,
f_dat$word),
plot=FALSE)[2])
1
0.8294303
Create a dummy variable that is T at the start of each vowel token, and F otherwise. Include the AR.start and rho arguments
f_dat %<>%
group_by(idstring, id)%>%
mutate(meas = 1:n(),
start = meas == 1)
f_mod_ar <- bam(F1 ~ s(prop_t) +
s(prop_t, by = voiceless) +
s(prop_t, by = nasal)+
s(idstring, bs = 're')+
s(word, bs = 're'),
AR.start = f_dat$start,
rho = valRho,
data = f_dat)
summary(f_mod_ar)
Family: gaussian
Link function: identity
Formula:
F1 ~ s(prop_t) + s(prop_t, by = voiceless) + s(prop_t, by = nasal) +
s(idstring, bs = "re") + s(word, bs = "re")
Parametric coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 770.480 6.223 123.8 <2e-16 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Approximate significance of smooth terms:
edf Ref.df F p-value
s(prop_t) 8.740 8.931 541.4 <2e-16 ***
s(prop_t):voiceless 8.493 9.480 126.1 <2e-16 ***
s(prop_t):nasal 9.760 9.957 184.7 <2e-16 ***
s(idstring) 176.227 179.000 530.9 <2e-16 ***
s(word) 218.541 270.000 109.0 <2e-16 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
R-sq.(adj) = 0.314 Deviance explained = 31.5%
fREML = 1.0852e+06 Scale est. = 4498 n = 213840
plot(f_mod_ar)
plot_smooth(f_mod_ar, view = "prop_t", cond= list(nasal = 0, voiceless = 0),
rm.ranef = T,
ylim = c(650, 850))
Summary:
* prop_t : numeric predictor; with 30 values ranging from 0.000000 to 1.000000.
* voiceless : numeric predictor; set to the value(s): 0.
* nasal : numeric predictor; set to the value(s): 0.
* idstring : factor; set to the value(s): PH73-5-1-. (Might be canceled as random effect, check below.)
* word : factor; set to the value(s): OUT. (Might be canceled as random effect, check below.)
* NOTE : The following random effects columns are canceled: s(idstring),s(word)
plot_smooth(f_mod_ar, view = "prop_t", cond= list(nasal = 0 , voiceless = 1), add = T,
rm.ranef = T,
col = 'red')
Summary:
* prop_t : numeric predictor; with 30 values ranging from 0.000000 to 1.000000.
* voiceless : numeric predictor; set to the value(s): 1.
* nasal : numeric predictor; set to the value(s): 0.
* idstring : factor; set to the value(s): PH73-5-1-. (Might be canceled as random effect, check below.)
* word : factor; set to the value(s): OUT. (Might be canceled as random effect, check below.)
* NOTE : The following random effects columns are canceled: s(idstring),s(word)
plot_smooth(f_mod_ar, view = "prop_t", cond= list(nasal = 1 , voiceless = 0), add = T,
rm.ranef = T,
col = 'blue')
Summary:
* prop_t : numeric predictor; with 30 values ranging from 0.000000 to 1.000000.
* voiceless : numeric predictor; set to the value(s): 0.
* nasal : numeric predictor; set to the value(s): 1.
* idstring : factor; set to the value(s): PH73-5-1-. (Might be canceled as random effect, check below.)
* word : factor; set to the value(s): OUT. (Might be canceled as random effect, check below.)
* NOTE : The following random effects columns are canceled: s(idstring),s(word)
acf_resid(f_mod_ar, split_pred = "AR.start")