#packages
library(tidyverse)
library(cowplot)
library(knitr)
library(htmltools)
library(glue)
theme_set(theme_cowplot())
###function for adding html audio
html_tag_audio <- function(file, type = "wav") {
type <- match.arg(type)
htmltools::tags$audio(
controls = NA,
htmltools::tags$source(
src = file,
type = glue::glue("audio/{type}", type = type)
)
)
}
#read in data
d <- read.csv("WordProperties - phonotactic probability.csv")
pitch <- read.csv("pitch_lists.csv")
#pitch$word <- pitch$item
d <- d %>%
left_join(pitch)
#clean up the data frame
d <- d %>%
rename(seg_pp=adult_corpus_avg_segment_phonotactic_probability,bi_pp=adult_corpus_avg_biphone_phonotactic.probability)
# define pairs for each list and add info to data frame
#list1=c("toma","manu","fiffin","kita","modi","regli", "tosip","coodle")
#list2=c("blicket","sarel","boskot","koba","tever", "chatten","jefa",
# "fuppy")
list1=c("toma","manu","fiffin","modi","kita", "chatten","gazzer","jefa")
list2=c("blicket","sarel","koba","tever","boskot","regli" ,"doopy","pizer")
background_items=c("fiffin","tosip","regli","doopy","pizer","virdex")
test_items_1 <- c("manu","kita")
test_items_2 <- c("sarel","boskot")
list1 <- c(background_items,test_items_1)
list2 <- c(background_items,test_items_2)
d$new_list <- ifelse(d$word %in% background_items, "background_items",
ifelse(d$word %in% test_items_1, "test_items_1",
ifelse(d$word %in% test_items_2, "test_items_2","not_used")))
#create phonotactics summary values for each list
sum_d_phon_list1 <- d %>%
filter(word %in% list1) %>%
summarize(
mean_seg=mean(seg_pp),
median_seg=median(seg_pp),
sd_seg=sd(seg_pp),
mean_bi=mean(bi_pp),
median_bi=median(bi_pp),
sd_bi=sd(bi_pp),
max_seg=max(seg_pp),
min_seg=min(seg_pp),
max_bi=max(bi_pp),
min_bi=min(bi_pp),
new_list="list1"
)
sum_d_phon_list2 <- d %>%
filter(word %in% list2) %>%
summarize(
mean_seg=mean(seg_pp),
median_seg=median(seg_pp),
sd_seg=sd(seg_pp),
mean_bi=mean(bi_pp),
median_bi=median(bi_pp),
sd_bi=sd(bi_pp),
max_seg=max(seg_pp),
min_seg=min(seg_pp),
max_bi=max(bi_pp),
min_bi=min(bi_pp),
new_list="list2"
)
sum_d_phon <- bind_rows(sum_d_phon_list1,sum_d_phon_list2)
#create pitch summary values for each list
sum_d_pitch_list1 <- d %>%
filter(word %in% list1)%>%
summarize(
mean_f0_mean=mean(f0_mean),
median_f0_mean=median(f0_mean),
sd_f0_mean=sd(f0_mean),
mean_f0_min=mean(f0_min),
median_f0_min=median(f0_min),
sd_f0_min=sd(f0_min),
mean_f0_max=mean(f0_max),
median_f0_max=median(f0_max),
sd_f0_max=sd(f0_max),
new_list="list1"
)
sum_d_pitch_list2 <- d %>%
filter(word %in% list2)%>%
summarize(
mean_f0_mean=mean(f0_mean),
median_f0_mean=median(f0_mean),
sd_f0_mean=sd(f0_mean),
mean_f0_min=mean(f0_min),
median_f0_min=median(f0_min),
sd_f0_min=sd(f0_min),
mean_f0_max=mean(f0_max),
median_f0_max=median(f0_max),
sd_f0_max=sd(f0_max),
new_list="list2"
)
sum_d_pitch <- bind_rows(sum_d_pitch_list1,sum_d_pitch_list2)
Design Concept
#create audio files in a loop
audio_list_1=paste('<audio controls><source src="audio/',list1,'.wav" type="audio/wav"/></audio>',sep="")
audio_list_2=paste('<audio controls><source src="audio/',list2,'.wav" type="audio/wav"/></audio>',sep="")
list_d <- data.frame(
list_1=list1,
audio_list_1=audio_list_1,
list_2=list2,
audio_list_2=audio_list_2)
kable(list_d)
| list_1 | audio_list_1 | list_2 | audio_list_2 |
|---|---|---|---|
| fiffin | fiffin | ||
| tosip | tosip | ||
| regli | regli | ||
| doopy | doopy | ||
| pizer | pizer | ||
| virdex | virdex | ||
| manu | sarel | ||
| kita | boskot |
Word-final and word-initial sounds are pretty well branched out (coded in Klattese, so in some cases essentially final syllables).
p1 <- ggplot(filter(d,word %in% c(list1,list2)),aes(init_sound))+
geom_histogram(stat="count")+
ggtitle("INITITAL SOUNDS")
p2 <- ggplot(filter(d,word %in% c(list1,list2)),aes(final_sound))+
geom_histogram(stat="count")+
ggtitle("FINAL SOUNDS")
plot_grid(p1,p2,ncol=2)
The only difference in the test items that stands out to me whjen comparing the pairings (manu vs. sarel; kita vs. boskot) is that “kita” has a higher F0 peak. I don’t find that it stands out too much to my ear when listening to the items, so I’m leaning towards this being ok? (No combo of the currently recorded items is going to be perfectly matched in every way we care about).
kable(subset(d, word %in% c(test_items_1,test_items_2),select=c(word,seg_pp,bi_pp,f0_mean,f0_min,f0_max)),col.names=c("Word","segment probability","biphone probability","F0 Mean","F0 Minimum","F0 Maximum"),row.names=NA, digits = 5)
| Word | segment probability | biphone probability | F0 Mean | F0 Minimum | F0 Maximum | |
|---|---|---|---|---|---|---|
| 9 | boskot | 0.0580 | 0.00276 | 203.21 | 133.54 | 308.63 |
| 10 | kita | 0.0676 | 0.00233 | 210.60 | 130.48 | 357.36 |
| 21 | manu | 0.0610 | 0.00830 | 190.12 | 135.36 | 281.59 |
| 24 | sarel | 0.0660 | 0.00650 | 192.30 | 133.99 | 290.16 |
The table below compares the two lists in terms of phonotactic probability (seg==segment (roughly speaking, phone) probability; bi == biphone probability).
kable(filter(sum_d_phon, new_list!="not_used"), digits = 5)
| mean_seg | median_seg | sd_seg | mean_bi | median_bi | sd_bi | max_seg | min_seg | max_bi | min_bi | new_list |
|---|---|---|---|---|---|---|---|---|---|---|
| 0.04965 | 0.0489 | 0.01199 | 0.00300 | 0.00240 | 0.00228 | 0.0676 | 0.0315 | 0.0083 | 0.00127 | list1 |
| 0.04908 | 0.0489 | 0.01127 | 0.00282 | 0.00262 | 0.00168 | 0.0660 | 0.0315 | 0.0065 | 0.00127 | list2 |
kable(filter(sum_d_pitch, new_list!="not_used"), digits = 5)
| mean_f0_mean | median_f0_mean | sd_f0_mean | mean_f0_min | median_f0_min | sd_f0_min | mean_f0_max | median_f0_max | sd_f0_max | new_list |
|---|---|---|---|---|---|---|---|---|---|
| 203.5525 | 193.880 | 23.86189 | 132.0725 | 130.125 | 4.37394 | 341.765 | 356.195 | 43.19967 | list1 |
| 202.9013 | 194.125 | 23.54016 | 132.2837 | 131.655 | 4.24347 | 336.745 | 331.830 | 42.69267 | list2 |
ggplot(filter(d,new_list!="not_used"),aes(seg_pp,bi_pp, label=word, color=new_list)) +
geom_point()+
geom_label()+
geom_point(data=filter(d,new_list=="not_used"), alpha=0.5)+
geom_text(data=filter(d,new_list=="not_used"), alpha=0.5)+
xlab("average segment probability")+
ylab("average biphone phonotactive probability")+
theme(legend.position=c(0.2,0.8))