#packages
library(tidyverse)
library(cowplot)
library(knitr)
library(htmltools)
library(glue)
theme_set(theme_cowplot())

###function for adding html audio
html_tag_audio <- function(file, type = "wav") {
  type <- match.arg(type)
  htmltools::tags$audio(
    controls = NA,
    htmltools::tags$source(
      src = file,
      type = glue::glue("audio/{type}", type = type)
    )
  )
}

#read in data
d <- read.csv("WordProperties - phonotactic probability.csv")
pitch <- read.csv("pitch_lists.csv")
#pitch$word <- pitch$item
d <- d %>%
  left_join(pitch)

#clean up the data frame
d <- d %>%
  rename(seg_pp=adult_corpus_avg_segment_phonotactic_probability,bi_pp=adult_corpus_avg_biphone_phonotactic.probability)


# define pairs for each list and add info to data frame
#list1=c("toma","manu","fiffin","kita","modi","regli", "tosip","coodle")
#list2=c("blicket","sarel","boskot","koba","tever", "chatten","jefa",
#        "fuppy")
list1=c("toma","manu","fiffin","modi","kita", "chatten","gazzer","jefa")
list2=c("blicket","sarel","koba","tever","boskot","regli" ,"doopy","pizer")

background_items=c("fiffin","tosip","regli","doopy","pizer","virdex")


test_items_1 <- c("manu","kita")
test_items_2 <- c("sarel","boskot")

list1 <- c(background_items,test_items_1)
list2 <- c(background_items,test_items_2)

d$new_list <- ifelse(d$word %in% background_items, "background_items",
                     ifelse(d$word %in% test_items_1, "test_items_1",
                            ifelse(d$word %in% test_items_2, "test_items_2","not_used")))

#create phonotactics summary values for each list
sum_d_phon_list1 <- d %>%
  filter(word %in% list1) %>%
  summarize(
    mean_seg=mean(seg_pp),
    median_seg=median(seg_pp),
    sd_seg=sd(seg_pp),
    mean_bi=mean(bi_pp),
    median_bi=median(bi_pp),
    sd_bi=sd(bi_pp),
    max_seg=max(seg_pp),
    min_seg=min(seg_pp),
    max_bi=max(bi_pp),
    min_bi=min(bi_pp),
    new_list="list1"
  )

sum_d_phon_list2 <- d %>%
  filter(word %in% list2) %>%
  summarize(
    mean_seg=mean(seg_pp),
    median_seg=median(seg_pp),
    sd_seg=sd(seg_pp),
    mean_bi=mean(bi_pp),
    median_bi=median(bi_pp),
    sd_bi=sd(bi_pp),
    max_seg=max(seg_pp),
    min_seg=min(seg_pp),
    max_bi=max(bi_pp),
    min_bi=min(bi_pp),
    new_list="list2"
  )

sum_d_phon <- bind_rows(sum_d_phon_list1,sum_d_phon_list2)

#create pitch summary values for each list
sum_d_pitch_list1 <- d %>%
  filter(word %in% list1)%>%
  summarize(
    mean_f0_mean=mean(f0_mean),
    median_f0_mean=median(f0_mean),
    sd_f0_mean=sd(f0_mean),
    mean_f0_min=mean(f0_min),
    median_f0_min=median(f0_min),
    sd_f0_min=sd(f0_min),
    mean_f0_max=mean(f0_max),
    median_f0_max=median(f0_max),
    sd_f0_max=sd(f0_max),
    new_list="list1"
  )
sum_d_pitch_list2 <- d %>%
  filter(word %in% list2)%>%
  summarize(
    mean_f0_mean=mean(f0_mean),
    median_f0_mean=median(f0_mean),
    sd_f0_mean=sd(f0_mean),
    mean_f0_min=mean(f0_min),
    median_f0_min=median(f0_min),
    sd_f0_min=sd(f0_min),
    mean_f0_max=mean(f0_max),
    median_f0_max=median(f0_max),
    sd_f0_max=sd(f0_max),
    new_list="list2"
  )

sum_d_pitch <- bind_rows(sum_d_pitch_list1,sum_d_pitch_list2)

Overview over new design concept - “background items” + test item pairs

Design Concept

Design Concept

The Words

#create audio files in a loop
audio_list_1=paste('<audio controls><source src="audio/',list1,'.wav" type="audio/wav"/></audio>',sep="")
audio_list_2=paste('<audio controls><source src="audio/',list2,'.wav" type="audio/wav"/></audio>',sep="")
list_d <- data.frame(
  list_1=list1,
  audio_list_1=audio_list_1,
  list_2=list2,
  audio_list_2=audio_list_2)

kable(list_d)
list_1 audio_list_1 list_2 audio_list_2
fiffin fiffin
tosip tosip
regli regli
doopy doopy
pizer pizer
virdex virdex
manu sarel
kita boskot

Initial and final sounds

Word-final and word-initial sounds are pretty well branched out (coded in Klattese, so in some cases essentially final syllables).

p1 <- ggplot(filter(d,word %in% c(list1,list2)),aes(init_sound))+
  geom_histogram(stat="count")+
    ggtitle("INITITAL SOUNDS")
p2 <- ggplot(filter(d,word %in% c(list1,list2)),aes(final_sound))+
  geom_histogram(stat="count")+
  ggtitle("FINAL SOUNDS")

plot_grid(p1,p2,ncol=2)

Properties of the test items only

The only difference in the test items that stands out to me whjen comparing the pairings (manu vs. sarel; kita vs. boskot) is that “kita” has a higher F0 peak. I don’t find that it stands out too much to my ear when listening to the items, so I’m leaning towards this being ok? (No combo of the currently recorded items is going to be perfectly matched in every way we care about).

kable(subset(d, word %in% c(test_items_1,test_items_2),select=c(word,seg_pp,bi_pp,f0_mean,f0_min,f0_max)),col.names=c("Word","segment probability","biphone probability","F0 Mean","F0 Minimum","F0 Maximum"),row.names=NA, digits = 5)
Word segment probability biphone probability F0 Mean F0 Minimum F0 Maximum
9 boskot 0.0580 0.00276 203.21 133.54 308.63
10 kita 0.0676 0.00233 210.60 130.48 357.36
21 manu 0.0610 0.00830 190.12 135.36 281.59
24 sarel 0.0660 0.00650 192.30 133.99 290.16

Summarize Properties of Words By List

Phonotactic probability

The table below compares the two lists in terms of phonotactic probability (seg==segment (roughly speaking, phone) probability; bi == biphone probability).

kable(filter(sum_d_phon, new_list!="not_used"), digits = 5)
mean_seg median_seg sd_seg mean_bi median_bi sd_bi max_seg min_seg max_bi min_bi new_list
0.04965 0.0489 0.01199 0.00300 0.00240 0.00228 0.0676 0.0315 0.0083 0.00127 list1
0.04908 0.0489 0.01127 0.00282 0.00262 0.00168 0.0660 0.0315 0.0065 0.00127 list2

Pitch

kable(filter(sum_d_pitch, new_list!="not_used"), digits = 5)
mean_f0_mean median_f0_mean sd_f0_mean mean_f0_min median_f0_min sd_f0_min mean_f0_max median_f0_max sd_f0_max new_list
203.5525 193.880 23.86189 132.0725 130.125 4.37394 341.765 356.195 43.19967 list1
202.9013 194.125 23.54016 132.2837 131.655 4.24347 336.745 331.830 42.69267 list2

Plotting the segment and biphone phonotactic probability

ggplot(filter(d,new_list!="not_used"),aes(seg_pp,bi_pp, label=word, color=new_list)) +
  geom_point()+
  geom_label()+
  geom_point(data=filter(d,new_list=="not_used"), alpha=0.5)+
  geom_text(data=filter(d,new_list=="not_used"), alpha=0.5)+
  xlab("average segment probability")+
  ylab("average biphone phonotactive probability")+
  theme(legend.position=c(0.2,0.8))