Infant Effect Sizes Word Properties

#packages
library(tidyverse)
library(cowplot)
library(knitr)
library(htmltools)
library(glue)
theme_set(theme_cowplot())

###function for adding html audio
html_tag_audio <- function(file, type = "wav") {
  type <- match.arg(type)
  htmltools::tags$audio(
    controls = NA,
    htmltools::tags$source(
      src = file,
      type = glue::glue("audio/{type}", type = type)
    )
  )
}

#read in data
d <- read.csv("WordProperties - phonotactic probability.csv")
pitch <- read.csv("pitch_lists.csv")
#pitch$word <- pitch$item
d <- d %>%
  left_join(pitch)

#clean up the data frame
d <- d %>%
  rename(seg_pp=adult_corpus_avg_segment_phonotactic_probability,bi_pp=adult_corpus_avg_biphone_phonotactic.probability)


# define pairs for each list and add info to data frame
#list1=c("toma","manu","fiffin","kita","modi","regli", "tosip","coodle")
#list2=c("blicket","sarel","boskot","koba","tever", "chatten","jefa",
#        "fuppy")
list1=c("toma","manu","fiffin","modi","kita", "chatten","gazzer","jefa")
list2=c("blicket","sarel","koba","tever","boskot","regli" ,"doopy","pizer")

background_items=c("fiffin","tosip","regli","doopy","pizer","virdex")


test_items_1 <- c("manu","kita")
test_items_2 <- c("sarel","boskot")

list1 <- c(background_items,test_items_1)
list2 <- c(background_items,test_items_2)

d$new_list <- ifelse(d$word %in% background_items, "background_items",
                     ifelse(d$word %in% test_items_1, "test_items_1",
                            ifelse(d$word %in% test_items_2, "test_items_2","not_used")))

#create phonotactics summary values for each list
sum_d_phon_list1 <- d %>%
  filter(word %in% list1) %>%
  summarize(
    mean_seg=mean(seg_pp),
    median_seg=median(seg_pp),
    sd_seg=sd(seg_pp),
    mean_bi=mean(bi_pp),
    median_bi=median(bi_pp),
    sd_bi=sd(bi_pp),
    max_seg=max(seg_pp),
    min_seg=min(seg_pp),
    max_bi=max(bi_pp),
    min_bi=min(bi_pp),
    new_list="list1"
  )

sum_d_phon_list2 <- d %>%
  filter(word %in% list2) %>%
  summarize(
    mean_seg=mean(seg_pp),
    median_seg=median(seg_pp),
    sd_seg=sd(seg_pp),
    mean_bi=mean(bi_pp),
    median_bi=median(bi_pp),
    sd_bi=sd(bi_pp),
    max_seg=max(seg_pp),
    min_seg=min(seg_pp),
    max_bi=max(bi_pp),
    min_bi=min(bi_pp),
    new_list="list2"
  )

sum_d_phon <- bind_rows(sum_d_phon_list1,sum_d_phon_list2)

#create pitch summary values for each list
sum_d_pitch_list1 <- d %>%
  filter(word %in% list1)%>%
  summarize(
    mean_f0_mean=mean(f0_mean),
    median_f0_mean=median(f0_mean),
    sd_f0_mean=sd(f0_mean),
    mean_f0_min=mean(f0_min),
    median_f0_min=median(f0_min),
    sd_f0_min=sd(f0_min),
    mean_f0_max=mean(f0_max),
    median_f0_max=median(f0_max),
    sd_f0_max=sd(f0_max),
    new_list="list1"
  )
sum_d_pitch_list2 <- d %>%
  filter(word %in% list2)%>%
  summarize(
    mean_f0_mean=mean(f0_mean),
    median_f0_mean=median(f0_mean),
    sd_f0_mean=sd(f0_mean),
    mean_f0_min=mean(f0_min),
    median_f0_min=median(f0_min),
    sd_f0_min=sd(f0_min),
    mean_f0_max=mean(f0_max),
    median_f0_max=median(f0_max),
    sd_f0_max=sd(f0_max),
    new_list="list2"
  )

sum_d_pitch <- bind_rows(sum_d_pitch_list1,sum_d_pitch_list2)

Overview over new design concept - “background items” + test item pairs

Design Concept

The Words

#create audio files in a loop
audio_list_1=paste('<audio controls><source src="audio/',list1,'.wav" type="audio/wav"/></audio>',sep="")
audio_list_2=paste('<audio controls><source src="audio/',list2,'.wav" type="audio/wav"/></audio>',sep="")
list_d <- data.frame(
  list_1=list1,
  audio_list_1=audio_list_1,
  list_2=list2,
  audio_list_2=audio_list_2)

kable(list_d)

list_1	audio_list_1	list_2	audio_list_2
fiffin		fiffin
tosip		tosip
regli		regli
doopy		doopy
pizer		pizer
virdex		virdex
manu		sarel
kita		boskot

Initial and final sounds

Word-final and word-initial sounds are pretty well branched out (coded in Klattese, so in some cases essentially final syllables).

p1 <- ggplot(filter(d,word %in% c(list1,list2)),aes(init_sound))+
  geom_histogram(stat="count")+
    ggtitle("INITITAL SOUNDS")
p2 <- ggplot(filter(d,word %in% c(list1,list2)),aes(final_sound))+
  geom_histogram(stat="count")+
  ggtitle("FINAL SOUNDS")

plot_grid(p1,p2,ncol=2)

Properties of the test items only

The only difference in the test items that stands out to me whjen comparing the pairings (manu vs. sarel; kita vs. boskot) is that “kita” has a higher F0 peak. I don’t find that it stands out too much to my ear when listening to the items, so I’m leaning towards this being ok? (No combo of the currently recorded items is going to be perfectly matched in every way we care about).

kable(subset(d, word %in% c(test_items_1,test_items_2),select=c(word,seg_pp,bi_pp,f0_mean,f0_min,f0_max)),col.names=c("Word","segment probability","biphone probability","F0 Mean","F0 Minimum","F0 Maximum"),row.names=NA, digits = 5)

	Word	segment probability	biphone probability	F0 Mean	F0 Minimum	F0 Maximum
9	boskot	0.0580	0.00276	203.21	133.54	308.63
10	kita	0.0676	0.00233	210.60	130.48	357.36
21	manu	0.0610	0.00830	190.12	135.36	281.59
24	sarel	0.0660	0.00650	192.30	133.99	290.16

Summarize Properties of Words By List

Phonotactic probability

The table below compares the two lists in terms of phonotactic probability (seg==segment (roughly speaking, phone) probability; bi == biphone probability).

kable(filter(sum_d_phon, new_list!="not_used"), digits = 5)

mean_seg	median_seg	sd_seg	mean_bi	median_bi	sd_bi	max_seg	min_seg	max_bi	min_bi	new_list
0.04965	0.0489	0.01199	0.00300	0.00240	0.00228	0.0676	0.0315	0.0083	0.00127	list1
0.04908	0.0489	0.01127	0.00282	0.00262	0.00168	0.0660	0.0315	0.0065	0.00127	list2

Pitch

kable(filter(sum_d_pitch, new_list!="not_used"), digits = 5)

mean_f0_mean	median_f0_mean	sd_f0_mean	mean_f0_min	median_f0_min	sd_f0_min	mean_f0_max	median_f0_max	sd_f0_max	new_list
203.5525	193.880	23.86189	132.0725	130.125	4.37394	341.765	356.195	43.19967	list1
202.9013	194.125	23.54016	132.2837	131.655	4.24347	336.745	331.830	42.69267	list2

Plotting the segment and biphone phonotactic probability

ggplot(filter(d,new_list!="not_used"),aes(seg_pp,bi_pp, label=word, color=new_list)) +
  geom_point()+
  geom_label()+
  geom_point(data=filter(d,new_list=="not_used"), alpha=0.5)+
  geom_text(data=filter(d,new_list=="not_used"), alpha=0.5)+
  xlab("average segment probability")+
  ylab("average biphone phonotactive probability")+
  theme(legend.position=c(0.2,0.8))